4747MISSING_GENOTYPE_VALUE = vcf_parser .MISSING_GENOTYPE_VALUE
4848Variant = vcf_parser .Variant
4949VariantCall = vcf_parser .VariantCall
50+ SampleNameEncoding = vcf_parser .SampleNameEncoding
5051
5152
5253class _ToVcfRecordCoder (coders .Coder ):
@@ -58,7 +59,6 @@ def encode(self, variant):
5859 encoded_info = self ._encode_variant_info (variant )
5960 format_keys = self ._get_variant_format_keys (variant )
6061 encoded_calls = self ._encode_variant_calls (variant , format_keys )
61-
6262 columns = [
6363 variant .reference_name ,
6464 None if variant .start is None else variant .start + 1 ,
@@ -182,15 +182,17 @@ class _VcfSource(filebasedsource.FileBasedSource):
182182
183183 DEFAULT_VCF_READ_BUFFER_SIZE = 65536 # 64kB
184184
185- def __init__ (self ,
186- file_pattern , # type: str
187- representative_header_lines = None , # type: List[str]
188- compression_type = CompressionTypes .AUTO , # type: str
189- buffer_size = DEFAULT_VCF_READ_BUFFER_SIZE , # type: int
190- validate = True , # type: bool
191- allow_malformed_records = False , # type: bool
192- pre_infer_headers = False , # type: bool
193- ):
185+ def __init__ (
186+ self ,
187+ file_pattern , # type: str
188+ representative_header_lines = None , # type: List[str]
189+ compression_type = CompressionTypes .AUTO , # type: str
190+ buffer_size = DEFAULT_VCF_READ_BUFFER_SIZE , # type: int
191+ validate = True , # type: bool
192+ allow_malformed_records = False , # type: bool
193+ pre_infer_headers = False , # type: bool
194+ sample_name_encoding = SampleNameEncoding .WITHOUT_FILE_PATH # type: int
195+ ):
194196 # type: (...) -> None
195197 super (_VcfSource , self ).__init__ (file_pattern ,
196198 compression_type = compression_type ,
@@ -200,6 +202,8 @@ def __init__(self,
200202 self ._buffer_size = buffer_size
201203 self ._allow_malformed_records = allow_malformed_records
202204 self ._pre_infer_headers = pre_infer_headers
205+ self ._sample_name_encoding = sample_name_encoding
206+
203207
204208 def read_records (self ,
205209 file_name , # type: str
@@ -214,6 +218,7 @@ def read_records(self,
214218 file_pattern = self ._pattern ,
215219 representative_header_lines = self ._representative_header_lines ,
216220 pre_infer_headers = self ._pre_infer_headers ,
221+ sample_name_encoding = self ._sample_name_encoding ,
217222 buffer_size = self ._buffer_size ,
218223 skip_header_lines = 0 )
219224
@@ -229,7 +234,9 @@ def __init__(self,
229234 input_files ,
230235 representative_header_lines ,
231236 allow_malformed_records ,
232- pre_infer_headers ):
237+ pre_infer_headers ,
238+ sample_name_encoding = SampleNameEncoding .WITHOUT_FILE_PATH
239+ ):
233240 # type: (List[str], List[str], bool, bool, int) -> None
234241 """Initializes the transform.
235242
@@ -239,11 +246,16 @@ def __init__(self,
239246 VCF files.
240247 allow_malformed_records: If true, malformed records from VCF files will be
241248 returned as `MalformedVcfRecord` instead of failing the pipeline.
249+ pre_infer_headers: If true, drop headers and make sure PySam returns the
250+ exact data for variants and calls, without type matching.
251+ sample_name_encoding: specifies how to encode sample_name, mainly to deal
252+ with the same sample_name being used across multiple VCF files.
242253 """
243254 self ._input_files = input_files
244255 self ._representative_header_lines = representative_header_lines
245256 self ._allow_malformed_records = allow_malformed_records
246257 self ._pre_infer_headers = pre_infer_headers
258+ self ._sample_name_encoding = sample_name_encoding
247259
248260 def _read_records (self , (file_path , block )):
249261 # type: (Tuple[str, Block]) -> Iterable[Variant]
@@ -255,7 +267,8 @@ def _read_records(self, (file_path, block)):
255267 self ._allow_malformed_records ,
256268 representative_header_lines = self ._representative_header_lines ,
257269 splittable_bgzf = True ,
258- pre_infer_headers = self ._pre_infer_headers )
270+ pre_infer_headers = self ._pre_infer_headers ,
271+ sample_name_encoding = self ._sample_name_encoding )
259272
260273 for record in record_iterator :
261274 yield record
@@ -286,6 +299,7 @@ def __init__(
286299 validate = True , # type: bool
287300 allow_malformed_records = False , # type: bool
288301 pre_infer_headers = False , # type: bool
302+ sample_name_encoding = SampleNameEncoding .WITHOUT_FILE_PATH , # type: int
289303 ** kwargs # type: **str
290304 ):
291305 # type: (...) -> None
@@ -302,6 +316,10 @@ def __init__(
302316 underlying file_path's extension will be used to detect the compression.
303317 validate: flag to verify that the files exist during the pipeline creation
304318 time.
319+ pre_infer_headers: If true, drop headers and make sure PySam returns the
320+ exact data for variants and calls, without type matching.
321+ sample_name_encoding: specifies how to encode sample_name, mainly to deal
322+ with the same sample_name being used across multiple VCF files.
305323 """
306324 super (ReadFromVcf , self ).__init__ (** kwargs )
307325
@@ -311,20 +329,23 @@ def __init__(
311329 compression_type ,
312330 validate = validate ,
313331 allow_malformed_records = allow_malformed_records ,
314- pre_infer_headers = pre_infer_headers )
332+ pre_infer_headers = pre_infer_headers ,
333+ sample_name_encoding = sample_name_encoding )
315334
def expand(self, pvalue):
  """Applies a `Read` of this transform's VCF source to the pipeline.

  Args:
    pvalue: The input value; only its `pipeline` attribute is used.

  Returns:
    The result of piping `pvalue.pipeline` into `Read(self._source)`.
  """
  return pvalue.pipeline | Read(self._source)
318337
319338
def _create_vcf_source(
    file_pattern=None, representative_header_lines=None, compression_type=None,
    allow_malformed_records=None, pre_infer_headers=False,
    sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH):
  """Builds a `_VcfSource` from keyword arguments.

  Every argument is forwarded unchanged, by keyword, to the `_VcfSource`
  constructor. All parameters have defaults so that callers can pre-bind the
  options (e.g. with `partial`) and supply `file_pattern` later.

  Args:
    file_pattern: Forwarded as `_VcfSource(file_pattern=...)`.
    representative_header_lines: Forwarded to `_VcfSource`.
    compression_type: Forwarded to `_VcfSource`.
    allow_malformed_records: Forwarded to `_VcfSource`.
    pre_infer_headers: Forwarded to `_VcfSource`.
    sample_name_encoding: Forwarded to `_VcfSource`; defaults to
      `SampleNameEncoding.WITHOUT_FILE_PATH`.

  Returns:
    The newly constructed `_VcfSource`.
  """
  return _VcfSource(
      file_pattern=file_pattern,
      representative_header_lines=representative_header_lines,
      compression_type=compression_type,
      allow_malformed_records=allow_malformed_records,
      pre_infer_headers=pre_infer_headers,
      sample_name_encoding=sample_name_encoding)
328349
329350
330351class ReadAllFromVcf (PTransform ):
@@ -348,6 +369,7 @@ def __init__(
348369 compression_type = CompressionTypes .AUTO , # type: str
349370 allow_malformed_records = False , # type: bool
350371 pre_infer_headers = False , # type: bool
372+ sample_name_encoding = SampleNameEncoding .WITHOUT_FILE_PATH , # type: int
351373 ** kwargs # type: **str
352374 ):
353375 # type: (...) -> None
@@ -366,14 +388,19 @@ def __init__(
366388 underlying file_path's extension will be used to detect the compression.
367389 allow_malformed_records: If true, malformed records from VCF files will be
368390 returned as :class:`MalformedVcfRecord` instead of failing the pipeline.
391+ pre_infer_headers: If true, drop headers and make sure PySam returns the
392+ exact data for variants and calls, without type matching.
393+ sample_name_encoding: specifies how to encode sample_name, mainly to deal
394+ with the same sample_name being used across multiple VCF files.
369395 """
370396 super (ReadAllFromVcf , self ).__init__ (** kwargs )
371397 source_from_file = partial (
372398 _create_vcf_source ,
373399 representative_header_lines = representative_header_lines ,
374400 compression_type = compression_type ,
375401 allow_malformed_records = allow_malformed_records ,
376- pre_infer_headers = pre_infer_headers )
402+ pre_infer_headers = pre_infer_headers ,
403+ sample_name_encoding = sample_name_encoding )
377404 self ._read_all_files = filebasedsource .ReadAllFiles (
378405 True , # splittable
379406 CompressionTypes .AUTO , desired_bundle_size ,
0 commit comments