@@ -2028,6 +2028,10 @@ def haplotypes(self, samples=None, sites=None):
20282028 ``None``, return haplotypes for all sample nodes, otherwise this may be a
20292029 numpy array (or array-like) object (converted to dtype=np.int32).
20302030 :param array sites: A numpy array of sites to use.
2031+
2032+
2033+ :return: An iterator returning sucessive instances of (sample_id, haplotype).
2034+ :rtype: iter(int, numpy.ndarray(dtype=int8))
20312035 """
20322036 if samples is None :
20332037 samples = np .arange (self .num_samples )
@@ -2120,6 +2124,7 @@ class Ancestor:
21202124 time = attr .ib ()
21212125 focal_sites = attr .ib ()
21222126 haplotype = attr .ib ()
2127+ sample_id = attr .ib ()
21232128
21242129
21252130class AncestorData (DataContainer ):
@@ -2157,7 +2162,7 @@ class AncestorData(DataContainer):
21572162 """
21582163
21592164 FORMAT_NAME = "tsinfer-ancestor-data"
2160- FORMAT_VERSION = (3 , 0 )
2165+ FORMAT_VERSION = (3 , 1 )
21612166
21622167 def __init__ (self , sample_data , ** kwargs ):
21632168 super ().__init__ (** kwargs )
@@ -2218,6 +2223,13 @@ def __init__(self, sample_data, **kwargs):
22182223 compressor = self ._compressor ,
22192224 fill_value = None ,
22202225 )
2226+ self .data .create_dataset (
2227+ "ancestors/sample_id" ,
2228+ shape = (0 ,),
2229+ chunks = chunks ,
2230+ compressor = self ._compressor ,
2231+ dtype = np .int32 ,
2232+ )
22212233
22222234 self ._alloc_ancestor_writer ()
22232235
@@ -2233,6 +2245,7 @@ def _alloc_ancestor_writer(self):
22332245 "time" : self .ancestors_time ,
22342246 "focal_sites" : self .ancestors_focal_sites ,
22352247 "haplotype" : self .ancestors_haplotype ,
2248+ "sample_id" : self .ancestors_sample_id ,
22362249 },
22372250 num_threads = self ._num_flush_threads ,
22382251 )
@@ -2254,6 +2267,7 @@ def __str__(self):
22542267 ("ancestors/time" , zarr_summary (self .ancestors_time )),
22552268 ("ancestors/focal_sites" , zarr_summary (self .ancestors_focal_sites )),
22562269 ("ancestors/haplotype" , zarr_summary (self .ancestors_haplotype )),
2270+ ("ancestors/sample_id" , zarr_summary (self .ancestors_sample_id )),
22572271 ]
22582272 return super ().__str__ () + self ._format_str (values )
22592273
@@ -2278,6 +2292,9 @@ def data_equal(self, other):
22782292 self .ancestors_focal_sites [:], other .ancestors_focal_sites [:]
22792293 )
22802294 and np_obj_equal (self .ancestors_haplotype [:], other .ancestors_haplotype [:])
2295+ and np .array_equal (
2296+ self .ancestors_sample_id [:], other .ancestors_sample_id [:]
2297+ )
22812298 )
22822299
22832300 @property
@@ -2320,6 +2337,10 @@ def ancestors_focal_sites(self):
23202337 def ancestors_haplotype (self ):
23212338 return self .data ["ancestors/haplotype" ]
23222339
2340+ @property
2341+ def ancestors_sample_id (self ):
2342+ return self .data ["ancestors/sample_id" ]
2343+
23232344 @property
23242345 def ancestors_length (self ):
23252346 """
@@ -2338,6 +2359,7 @@ def insert_proxy_samples(
23382359 * ,
23392360 sample_ids = None ,
23402361 epsilon = None ,
2362+ map_ancestors = False ,
23412363 allow_mutation = False ,
23422364 require_same_sample_data = True ,
23432365 ** kwargs ,
@@ -2350,7 +2372,8 @@ def insert_proxy_samples(
23502372
23512373 A *proxy sample ancestor* is an ancestor based upon a known sample. At
23522374 sites used in the full inference process, the haplotype of this ancestor
2353- is identical to that of the sample on which it is based. The time of the
2375+ is identical to that of the sample on which it is based, and the
2376+ The time of the
23542377 ancestor is taken to be a fraction ``epsilon`` older than the sample on
23552378 which it is based.
23562379
@@ -2364,11 +2387,11 @@ def insert_proxy_samples(
23642387
23652388 .. note::
23662389
2367- The proxy sample ancestors inserted here will correspond to extra nodes
2368- in the inferred tree sequence. At sites which are not used in the full
2390+ The proxy sample ancestors inserted here will end up as extra nodes
2391+ in the inferred tree sequence, but at sites which are not used in the full
23692392 inference process (e.g. sites unique to a single historical sample),
2370- these proxy sample ancestor nodes may have a different genotype from
2371- their corresponding sample.
2393+ it is possible for these proxy sample ancestor nodes to have a different
2394+ genotype from their corresponding sample.
23722395
23732396 :param SampleData sample_data: The :class:`.SampleData` instance
23742397 from which to select the samples used to create extra ancestors.
@@ -2403,7 +2426,8 @@ def insert_proxy_samples(
24032426 to ensure that the encoding of alleles in ``sample_data`` matches the
24042427 encoding in the current :class:`AncestorData` instance (i.e. that in the
24052428 original :class:`.SampleData` instance on which the current ancestors
2406- are based).
2429+ are based). Note that in this case, the sample_id is not recorded in the
2430+ returned object.
24072431 :param \\ **kwargs: Further arguments passed to the constructor when creating
24082432 the new :class:`AncestorData` instance which will be returned.
24092433
@@ -2501,7 +2525,11 @@ def insert_proxy_samples(
25012525 time = proxy_time ,
25022526 focal_sites = [],
25032527 haplotype = haplotype ,
2528+ sample_id = sample_id
2529+ if sample_data .uuid == self .sample_data_uuid
2530+ else tskit .NULL ,
25042531 )
2532+
25052533 # Add any ancestors remaining in the current instance
25062534 while ancestor is not None :
25072535 other .add_ancestor (** attr .asdict (ancestor , filter = exclude_id ))
@@ -2583,7 +2611,6 @@ def truncate_ancestors(
25832611 start = self .ancestors_start [:]
25842612 end = self .ancestors_end [:]
25852613 time = self .ancestors_time [:]
2586- focal_sites = self .ancestors_focal_sites [:]
25872614 haplotypes = self .ancestors_haplotype [:]
25882615 if upper_time_bound > np .max (time ) or lower_time_bound > np .max (time ):
25892616 raise ValueError ("Time bounds cannot be greater than older ancestor" )
@@ -2621,16 +2648,12 @@ def truncate_ancestors(
26212648 )
26222649 start [anc .id ] = insert_pos_start
26232650 end [anc .id ] = insert_pos_end
2624- time [anc .id ] = anc .time
2625- focal_sites [anc .id ] = anc .focal_sites
26262651 haplotypes [anc .id ] = anc .haplotype [
26272652 insert_pos_start - anc .start : insert_pos_end - anc .start
26282653 ]
26292654 # TODO - record truncation in ancestors' metadata when supported
26302655 truncated .ancestors_start [:] = start
26312656 truncated .ancestors_end [:] = end
2632- truncated .ancestors_time [:] = time
2633- truncated .ancestors_focal_sites [:] = focal_sites
26342657 truncated .ancestors_haplotype [:] = haplotypes
26352658 truncated .record_provenance (command = "truncate_ancestors" )
26362659 truncated .finalise ()
@@ -2651,6 +2674,12 @@ def set_inference_sites(self, site_ids):
26512674 sites in the sample data file, and the IDs must be in increasing order.
26522675
26532676 This must be called before the first call to :meth:`.add_ancestor`.
2677+
2678+ .. note::
2679+ To obtain a list of which sites in a sample data or a tree sequence have
2680+ been placed into the ancestors file for use in inference, you can apply
2681+ :func:`numpy.isin` to the list of positions, e.g.
2682+ ``np.isin(sample_data.sites_position[:], ancestors.sites_position[:])``
26542683 """
26552684 self ._check_build_mode ()
26562685 position = self .sample_data .sites_position [:][site_ids ]
@@ -2659,12 +2688,18 @@ def set_inference_sites(self, site_ids):
26592688 array [:] = position
26602689 self ._num_alleles = self .sample_data .num_alleles (site_ids )
26612690
2662- def add_ancestor (self , start , end , time , focal_sites , haplotype ):
2691+ def add_ancestor (
2692+ self , start , end , time , focal_sites , haplotype , sample_id = tskit .NULL
2693+ ):
26632694 """
26642695 Adds an ancestor with the specified haplotype, with ancestral material over the
26652696 interval [start:end], that is associated with the specified timepoint and has new
2666- mutations at the specified list of focal sites. Ancestors should be added in time
2667- order, with the oldest first. The id of the added ancestor is returned.
2697+ mutations at the specified list of focal sites. If this ancestor is based on a
2698+ specific sample from the associated sample_data file (i.e. a historical sample)
2699+ then the ``sample_id`` in the sample data file can also be passed as a parameter.
2700+
2701+ The Ancestors should be added in time order, with the oldest first. The id of
2702+ the added ancestor is returned.
26682703 """
26692704 self ._check_build_mode ()
26702705 haplotype = tskit .util .safe_np_int_cast (haplotype , dtype = np .int8 , copy = True )
@@ -2694,6 +2729,7 @@ def add_ancestor(self, start, end, time, focal_sites, haplotype):
26942729 time = time ,
26952730 focal_sites = focal_sites ,
26962731 haplotype = haplotype ,
2732+ sample_id = sample_id ,
26972733 )
26982734
26992735 def finalise (self ):
@@ -2715,6 +2751,7 @@ def ancestors(self):
27152751 end = self .ancestors_end [:]
27162752 time = self .ancestors_time [:]
27172753 focal_sites = self .ancestors_focal_sites [:]
2754+ sample_id = self .ancestors_sample_id [:]
27182755 for j , h in enumerate (chunk_iterator (self .ancestors_haplotype )):
27192756 yield Ancestor (
27202757 id = j ,
@@ -2723,6 +2760,7 @@ def ancestors(self):
27232760 time = time [j ],
27242761 focal_sites = focal_sites [j ],
27252762 haplotype = h ,
2763+ sample_id = sample_id [j ],
27262764 )
27272765
27282766
0 commit comments