Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,10 @@

- The `FixedPedigree` simulation model now supports internal samples ({issue}`1855`,
{pr}`2321`, {pr}`2326`, {pr}`2388`, {user}`abureau`, {user}`jeromekelleher`).

- Add support and wheels for Python3.13

- Support for simulations after local mrca,
({issue}`2157`, {pr}`2396`, {user}`jeromekelleher`, {user}`hossam26644`).
- Drop Python 3.9 support, require Python >= 3.10 ({pr}`2418`, {user}`benjeffery`)

- Add wheels on Windows ({pr}`2414`, {issue}`2200`,{user}`benjeffery`)

**Breaking changes**:
Expand Down
79 changes: 48 additions & 31 deletions algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@


logger = daiquiri.getLogger()
INFINITY = sys.float_info.max


class FenwickTree:
Expand Down Expand Up @@ -247,7 +248,7 @@ def _get_common_ancestor_waiting_time(self, np, t):
Returns the random waiting time until a common ancestor event
occurs within this population.
"""
ret = sys.float_info.max
ret = INFINITY
u = random.expovariate(2 * np)
if self.growth_rate == 0:
ret = self.start_size * u
Expand All @@ -267,7 +268,7 @@ def _get_common_ancestor_waiting_time(self, np, t):
def get_common_ancestor_waiting_time_hudson(self):
def _get_common_ancestor_waiting_time_hudson(t):
k = self.get_num_ancestors()
ret = sys.float_info.max
ret = INFINITY
if k > 1:
np = k * (k - 1) / 2
ret = self._get_common_ancestor_waiting_time(np, t)
Expand All @@ -278,7 +279,7 @@ def _get_common_ancestor_waiting_time_hudson(t):
def get_common_ancestor_waiting_time_smc_k(self):
def _get_common_ancestor_waiting_time_smc_k(t):
np = self.get_num_pairs()
ret = sys.float_info.max
ret = INFINITY
if np > 0:
ret = self._get_common_ancestor_waiting_time(np, t)
return ret
Expand Down Expand Up @@ -885,6 +886,7 @@ def __init__(
gene_conversion_length=1,
discrete_genome=True,
hull_offset=None,
stop_at_local_mrca=True,
):
# Must be a square matrix.
N = len(migration_matrix)
Expand All @@ -906,6 +908,7 @@ def __init__(
self.migration_matrix = migration_matrix
self.num_labels = num_labels
self.num_populations = N
self.stop_at_local_mrca = stop_at_local_mrca
self.max_segments = max_segments
self.coalescing_segments_only = coalescing_segments_only
self.additional_nodes = msprime.NodeType(additional_nodes)
Expand Down Expand Up @@ -964,7 +967,7 @@ def __init__(
self.sweep_trajectory = sweep_trajectory
self.time_slice = time_slice

self.modifier_events = [(sys.float_info.max, None, None)]
self.modifier_events = [(INFINITY, None, None)]
for time, pop_id, new_size in population_size_changes:
self.modifier_events.append(
(time, self.change_population_size, (int(pop_id), new_size))
Expand Down Expand Up @@ -1046,13 +1049,6 @@ def initialise(self, ts):
seg = seg.next
self.add_lineage(lineage)

def ancestors_remain(self):
"""
Returns True if the simulation is not finished, i.e., there is some ancestral
material that has not fully coalesced.
"""
return sum(pop.get_num_ancestors() for pop in self.P) != 0

def change_population_size(self, pop_id, size):
self.P[pop_id].set_start_size(size)

Expand Down Expand Up @@ -1253,7 +1249,7 @@ def simulate(self, end_time):
if self.model == "hudson":
self.hudson_simulate(end_time)
elif self.model == "dtwf":
self.dtwf_simulate()
self.dtwf_simulate(end_time)
elif self.model == "fixed_pedigree":
self.pedigree_simulate()
elif self.model == "single_sweep":
Expand Down Expand Up @@ -1311,46 +1307,51 @@ def find_cleft_individual(self, label, cleft_value):
individual_index -= num_ancestors
raise AssertionError()

def is_completed(self):
for x in self.S.values():
if x > 1:
return False
return True

def hudson_simulate(self, end_time):
"""
Simulates the algorithm until all loci have coalesced.
"""
infinity = sys.float_info.max
non_empty_pops = {pop.id for pop in self.P if pop.get_num_ancestors() > 0}
potential_destinations = self.get_potential_destinations()

# only worried about label 0 below
while len(non_empty_pops) > 0:
while not self.is_completed():
self.verify()
if self.t >= end_time:
break
# self.print_state()
re_rate = self.get_total_recombination_rate(label=0)
t_re = infinity
t_re = INFINITY
if re_rate > 0:
t_re = random.expovariate(re_rate)

# Gene conversion can occur within segments ..
gc_rate = self.get_total_gc_rate(label=0)
t_gcin = infinity
t_gcin = INFINITY
if gc_rate > 0:
t_gcin = random.expovariate(gc_rate)
# ... or to the left of the first segment.
gc_left_rate = self.get_total_gc_left_rate(label=0)
t_gc_left = infinity
t_gc_left = INFINITY
if gc_left_rate > 0:
t_gc_left = random.expovariate(gc_left_rate)

# Common ancestor events occur within demes.
t_ca = infinity
t_ca = INFINITY
for index in non_empty_pops:
pop = self.P[index]
assert pop.get_num_ancestors() > 0
t = pop.get_common_ancestor_waiting_time(self.t)
if t < t_ca:
t_ca = t
ca_population = index
t_mig = infinity
t_mig = INFINITY
# Migration events happen at the rates in the matrix.
for j in non_empty_pops:
source_size = self.P[j].get_num_ancestors()
Expand All @@ -1365,7 +1366,7 @@ def hudson_simulate(self, end_time):
mig_source = j
mig_dest = k
min_time = min(t_re, t_ca, t_gcin, t_gc_left, t_mig)
assert min_time != infinity
assert min_time != INFINITY
if self.t + min_time > self.modifier_events[0][0]:
t, func, args = self.modifier_events.pop(0)
self.t = t
Expand Down Expand Up @@ -1400,6 +1401,7 @@ def hudson_simulate(self, end_time):
non_empty_pops.remove(mig_source)
assert self.P[mig_dest].get_num_ancestors() > 0
non_empty_pops.add(mig_dest)

logger.info(
"%s time=%f n=%d",
event,
Expand Down Expand Up @@ -1441,7 +1443,7 @@ def single_sweep_simulate(self):
# main loop time
t_inc_orig = self.time_slice
e_time = 0.0
while self.ancestors_remain() and sweep_traj_step < len(times) - 1:
while sweep_traj_step < len(times) - 1 and not self.is_completed():
self.verify()
event_prob = 1.0
while event_prob > random.random() and sweep_traj_step < len(times) - 1:
Expand Down Expand Up @@ -1524,13 +1526,16 @@ def pedigree_simulate(self):
self.pedigree = Pedigree(self.tables)
self.dtwf_climb_pedigree()

def dtwf_simulate(self):
def dtwf_simulate(self, end_time):
"""
Simulates the algorithm until all loci have coalesced.
"""
while self.ancestors_remain():
self.t += 1
while not self.is_completed():
self.verify()
if self.t >= end_time:
break
self.t += 1
# print("DTWF", self.t)
self.dtwf_generation()

def dtwf_generation(self):
Expand Down Expand Up @@ -2256,13 +2261,14 @@ def merge_ancestors(self, H, pop_id, label, new_node_id=-1):
if r_max not in self.S:
j = self.S.floor_key(r_max)
self.S[r_max] = self.S[j]
min_overlap = len(X) if self.stop_at_local_mrca else 0
# Update the number of extant segments.
if self.S[left] == len(X):
if self.S[left] == min_overlap:
self.S[left] = 0
right = self.S.succ_key(left)
else:
right = left
while right < r_max and self.S[right] != len(X):
while right < r_max and self.S[right] != min_overlap:
self.S[right] -= len(X) - 1
right = self.S.succ_key(right)
alpha = self.alloc_segment(left, right, new_node_id, pop_id)
Expand Down Expand Up @@ -2394,6 +2400,7 @@ def merge_two_ancestors(self, population_index, label, x, y, u=-1):
new_lineage = self.alloc_lineage(None, population_index, label=label)
coalescence = False
defrag_required = False
min_overlap = 2 if self.stop_at_local_mrca else 0

while x is not None or y is not None:
alpha = None
Expand Down Expand Up @@ -2435,12 +2442,12 @@ def merge_two_ancestors(self, population_index, label, x, y, u=-1):
j = self.S.floor_key(r_max)
self.S[r_max] = self.S[j]
# Update the number of extant segments.
if self.S[left] == 2:
if self.S[left] == min_overlap:
self.S[left] = 0
right = self.S.succ_key(left)
else:
right = left
while right < r_max and self.S[right] != 2:
while right < r_max and self.S[right] != min_overlap:
self.S[right] -= 1
right = self.S.succ_key(right)
alpha = self.alloc_segment(
Expand Down Expand Up @@ -2732,10 +2739,10 @@ def verify(self):
Checks that the state of the simulator is consistent.
"""
self.verify_segments()
if self.model not in ["fixed_pedigree", "dtwf"]:
# The fixed_pedigree model doesn't maintain a bunch of stuff.
# It would probably be simpler if it did.
# The fixed_pedigree model doesn't maintain a bunch of stuff.
if self.model != "fixed_pedigree":
self.verify_overlaps()
if self.model not in ["fixed_pedigree", "dtwf"]:
for label in range(self.num_labels):
if self.recomb_mass_index is None:
assert self.recomb_map.total_mass == 0
Expand Down Expand Up @@ -2860,6 +2867,7 @@ def run_simulate(args):
gene_conversion_length=mean_tract_length,
discrete_genome=args.discrete,
hull_offset=args.offset,
stop_at_local_mrca=not args.continue_after_local_mrca,
)
ts = s.simulate(args.end_time)
ts.dump(args.output_file)
Expand Down Expand Up @@ -2946,6 +2954,15 @@ def add_simulator_arguments(parser):
help="The delta_t value for selective sweeps",
)
parser.add_argument("--model", default="hudson")
parser.add_argument(
"--continue-after-local-mrca",
action="store_true",
default=False,
help=(
"If set, continue after local MRCA (i.e., do not stop). "
"Default: False (stop at local MRCA)."
),
)
parser.add_argument("--offset", type=float, default=0.0)
parser.add_argument(
"--from-ts",
Expand Down
69 changes: 68 additions & 1 deletion docs/ancestry.md
Original file line number Diff line number Diff line change
Expand Up @@ -1129,6 +1129,7 @@ However, if `coalescing_segments_only=False`, then the edges recorded
would be from `m` to `p` over the entire segment `[a, b)`, and from `n` to `p` over the entire segment `[c,d)`.
The nodes `m` and `n` coalesce (in `p`) on only the overlapping segment `[c,b)`, and so the node `p` will be a unary node in the flanking regions: above `m` on the segment `[a,c)` and above `n` on the segment `[b,d)`.


(sec_additional_nodes_ca)=

### Common ancestor events
Expand Down Expand Up @@ -1480,6 +1481,38 @@ Migrations nodes are also recorded in the ARG using the
`NodeType.MIGRANT` flag. See the {ref}`sec_api_node_flags`
section for more details.

(sec_ancestry_stop_at_local_mrca)=

### Simulating ancestral to local MRCAs

By default, msprime stops simulating local trees when a local most recent
common ancestor (MRCA) is found. This is because events that occur above the
common ancestor are shared across all samples, and we are usually interested in
differences between samples. However, it is sometime useful to simulate ancestry
across the full span of the region until **all** local MRCAs have been reached.
To do this, we set the parameter `stop_at_local_mrca` of
{func}`.sim_ancestry` to `False`.

```{code-cell}
ts = msprime.sim_ancestry(
2, recombination_rate=0.1, sequence_length=10, random_seed=75,
stop_at_local_mrca=False, record_full_arg=True)
ts.draw_svg(time_scale='rank')
```

In this example we've continued the simulation *after* we've found roots for the local
trees, and we can see, for example, that node 14 is still the ancestor of 11
in the rightmost tree. This would be not recorded with the default ``stop_at_local_mrca=False``
because 11 is the root of that tree.

The stopping condition remains the same---we keep simulating until an MRCA has been
found at *all* local trees. The interpretation of the the final "root" nodes
across the trees is a little subtle, and follows the same logic as used when
{ref}`sec_ancestry_end_time`. All of the trees have a root node with time
equal to the oldest MRCA, and the nodes correspond to different lineages present
in the simulation.


## Manipulating simulation time

(sec_ancestry_end_time)=
Expand All @@ -1493,7 +1526,7 @@ for two diploid samples:

```{code-cell}
ts = msprime.sim_ancestry(2, sequence_length=10, recombination_rate=0.1, random_seed=42)
SVG(ts.draw_svg(y_axis=True, y_ticks=[0, 1, 2, 3, 4, 5]))
SVG(ts.draw_svg(y_axis=True, y_ticks=[0, 1, 2, 3, 4]))
```

Sometimes we would like to stop the simulation early, **before** complete
Expand Down Expand Up @@ -2780,6 +2813,39 @@ Because the input pedigree fully describes the simulation many features such as
combined with other {ref}`ancestry models<sec_ancestry_models>`.
:::

(sec_ancestry_models_fixed_pedigree_tracing)=

#### Tracing ancestry through a pedigree

The previous example generated trees that are embedded within the pedigree,
but doesn't tell us directly the exact path that each sample's genetic
material took through the ancestors. To trace these paths exactly, we can
use a combination of the ``additional_nodes``,
``coalescing_segments_only``
(see {ref}`sec_ancestry_additional_nodes`),
and ``stop_at_local_mrca`` (see {ref}`sec_ancestry_stop_at_local_mrca`).

```{code-cell}
ped_ts = msprime.sim_ancestry(
initial_state=pedigree,
model="fixed_pedigree",
random_seed=41,
recombination_rate=0.001,
coalescing_segments_only=False,
additional_nodes=(
msprime.NodeType.COMMON_ANCESTOR |
msprime.NodeType.PASS_THROUGH),
stop_at_local_mrca=True
)
node_labels = {node.id: f"{node.individual}({node.id})" for node in ped_ts.nodes()}
SVG(ped_ts.draw_svg(y_axis=True, node_labels=node_labels, size=(600,200)))
```

We can now see that the ancestor of node 15 is 20, which we would have to
infer in the previous example.



#### Censoring pedigree information

:::{warning}
Expand Down Expand Up @@ -2883,6 +2949,7 @@ when calculating population sizes. Thus, the pedigree must be seen as
entirely **external** to the population model simulated during recapitation.
:::


(sec_ancestry_models_fixed_pedigree_demography)=

#### Pedigrees and demography
Expand Down
Loading