googlegenomics
diff --git a/‎gcp_variant_transforms/beam_io/vcf_parser.py‎
Lines changed: 52 additions & 23 deletions b/‎gcp_variant_transforms/beam_io/vcf_parser.py‎
Lines changed: 52 additions & 23 deletions
diff --git a/‎gcp_variant_transforms/beam_io/vcfio.py‎
Lines changed: 43 additions & 16 deletions b/‎gcp_variant_transforms/beam_io/vcfio.py‎
Lines changed: 43 additions & 16 deletions
@@ -20,6 +20,7 @@
 from __future__ import absolute_import
 
 from collections import namedtuple
+import enum
 from typing import Iterable  # pylint: disable=unused-import
 import logging
 import os
@@ -30,6 +31,7 @@
 from pysam import libcbcf
 
 from gcp_variant_transforms.beam_io import bgzf
+from gcp_variant_transforms.libs import hashing_util
 
 # Stores data about failed VCF record reads. `line` is the text line that
 # caused the failed read and `file_name` is the name of the file that the read
@@ -55,6 +57,13 @@
 INFO_HEADER_TAG = '##INFO'
 LAST_HEADER_LINE_PREFIX = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
 
+
+class SampleNameEncoding(enum.Enum):
+  """An Enum specifying the way we encode sample_name."""
+  WITHOUT_FILE_PATH = 0  # ID is generated from the sample name only.
+  WITH_FILE_PATH = 1  # ID is generated from the sample name and file name.
+
+
 class Variant(object):
   """A class to store info about a genomic variant.
 
@@ -244,24 +253,27 @@ class VcfParser(object):
   ```
   """
 
-  def __init__(self,
-               file_name,  # type: str
-               range_tracker,  # type: range_trackers.OffsetRangeTracker
-               file_pattern,  # type: str
-               compression_type,  # type: str
-               allow_malformed_records,  # type: bool
-               representative_header_lines=None,  # type:  List[str]
-               splittable_bgzf=False,  # type: bool
-               pre_infer_headers=False,  # type: bool
-               **kwargs  # type: **str
-              ):
+  def __init__(
+      self,
+      file_name,  # type: str
+      range_tracker,  # type: range_trackers.OffsetRangeTracker
+      file_pattern,  # type: str
+      compression_type,  # type: str
+      allow_malformed_records,  # type: bool
+      representative_header_lines=None,  # type:  List[str]
+      splittable_bgzf=False,  # type: bool
+      pre_infer_headers=False,  # type: bool
+      sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,  # type: int
+      **kwargs  # type: **str
+      ):
     # type: (...) -> None
     # If `representative_header_lines` is given, header lines in `file_name`
     # are ignored; refer to _process_header_lines() logic.
     self._representative_header_lines = representative_header_lines
     self._file_name = file_name
     self._allow_malformed_records = allow_malformed_records
     self._pre_infer_headers = pre_infer_headers
+    self._sample_name_encoding = sample_name_encoding
 
     if splittable_bgzf:
       text_source = bgzf.BGZFBlockSource(
@@ -405,17 +417,19 @@ class PySamParser(VcfParser):
   class - we could only use a single pipe, but it will divert the parsers.
   """
 
-  def __init__(self,
-               file_name,  # type: str
-               range_tracker,  # type: range_trackers.OffsetRangeTracker
-               compression_type,  # type: str
-               allow_malformed_records,  # type: bool
-               file_pattern=None,  # type: str
-               representative_header_lines=None,  # type:  List[str]
-               splittable_bgzf=False,  # type: bool
-               pre_infer_headers=False,  # type: bool
-               **kwargs  # type: **str
-              ):
+  def __init__(
+      self,
+      file_name,  # type: str
+      range_tracker,  # type: range_trackers.OffsetRangeTracker
+      compression_type,  # type: str
+      allow_malformed_records,  # type: bool
+      file_pattern=None,  # type: str
+      representative_header_lines=None,  # type:  List[str]
+      splittable_bgzf=False,  # type: bool
+      pre_infer_headers=False,  # type: bool
+      sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,  # type: int
+      **kwargs  # type: **str
+      ):
     # type: (...) -> None
     super(PySamParser, self).__init__(file_name,
                                       range_tracker,
@@ -425,12 +439,15 @@ def __init__(self,
                                       representative_header_lines,
                                       splittable_bgzf,
                                       pre_infer_headers,
+                                      sample_name_encoding,
                                       **kwargs)
     # These members will be properly initiated in _init_parent_process().
     self._vcf_reader = None
     self._to_child = None
     self._original_info_list = None
     self._process_pid = None
+    self._encoded_sample_names = {}
+    self._sample_name_encoding = sample_name_encoding
 
   def send_kill_signal_to_child(self):
     self._to_child.write('\n')
@@ -592,6 +609,17 @@ def _convert_field(self, value, is_phaseset=False):
       value = value.encode('utf-8')
     return str(value)
 
+  def _lookup_encoded_sample_name(self, sample_name):
+    """Returns the cached hex-encoded ID for `sample_name`, hashing on miss."""
+    sample_id = self._encoded_sample_names.get(sample_name)
+    if sample_id is None:
+      hash_args = [sample_name]
+      if self._sample_name_encoding == SampleNameEncoding.WITH_FILE_PATH:
+        hash_args.append(self._file_name)
+      sample_id = hex(hashing_util.generate_sample_id(*hash_args))
+      self._encoded_sample_names[sample_name] = sample_id
+    return sample_id
+
   def _get_variant_calls(self, samples):
     # type: (libcvcf.VariantRecordSamples) -> List[VariantCall]
     calls = []
@@ -623,6 +651,7 @@ def _get_variant_calls(self, samples):
       # before settings default phaseset value.
       if phaseset is None and sample.phased and len(genotype) > 1:
         phaseset = DEFAULT_PHASESET_VALUE
-      calls.append(VariantCall(name, genotype, phaseset, info))
+      encoded_name = self._lookup_encoded_sample_name(name)
+      calls.append(VariantCall(encoded_name, genotype, phaseset, info))
 
     return calls
@@ -47,6 +47,7 @@
 MISSING_GENOTYPE_VALUE = vcf_parser.MISSING_GENOTYPE_VALUE
 Variant = vcf_parser.Variant
 VariantCall = vcf_parser.VariantCall
+SampleNameEncoding = vcf_parser.SampleNameEncoding
 
 
 class _ToVcfRecordCoder(coders.Coder):
@@ -58,7 +59,6 @@ def encode(self, variant):
     encoded_info = self._encode_variant_info(variant)
     format_keys = self._get_variant_format_keys(variant)
     encoded_calls = self._encode_variant_calls(variant, format_keys)
-
     columns = [
         variant.reference_name,
         None if variant.start is None else variant.start + 1,
@@ -182,15 +182,17 @@ class _VcfSource(filebasedsource.FileBasedSource):
 
   DEFAULT_VCF_READ_BUFFER_SIZE = 65536  # 64kB
 
-  def __init__(self,
-               file_pattern,  # type: str
-               representative_header_lines=None,  # type: List[str]
-               compression_type=CompressionTypes.AUTO,  # type: str
-               buffer_size=DEFAULT_VCF_READ_BUFFER_SIZE,  # type: int
-               validate=True,  # type: bool
-               allow_malformed_records=False,  # type: bool
-               pre_infer_headers=False,  # type: bool
-              ):
+  def __init__(
+      self,
+      file_pattern,  # type: str
+      representative_header_lines=None,  # type: List[str]
+      compression_type=CompressionTypes.AUTO,  # type: str
+      buffer_size=DEFAULT_VCF_READ_BUFFER_SIZE,  # type: int
+      validate=True,  # type: bool
+      allow_malformed_records=False,  # type: bool
+      pre_infer_headers=False,  # type: bool
+      sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH  # type: int
+      ):
     # type: (...) -> None
     super(_VcfSource, self).__init__(file_pattern,
                                      compression_type=compression_type,
@@ -200,6 +202,8 @@ def __init__(self,
     self._buffer_size = buffer_size
     self._allow_malformed_records = allow_malformed_records
     self._pre_infer_headers = pre_infer_headers
+    self._sample_name_encoding = sample_name_encoding
+
 
   def read_records(self,
                    file_name,  # type: str
@@ -214,6 +218,7 @@ def read_records(self,
         file_pattern=self._pattern,
         representative_header_lines=self._representative_header_lines,
         pre_infer_headers=self._pre_infer_headers,
+        sample_name_encoding=self._sample_name_encoding,
         buffer_size=self._buffer_size,
         skip_header_lines=0)
 
@@ -229,7 +234,9 @@ def __init__(self,
                input_files,
                representative_header_lines,
                allow_malformed_records,
-               pre_infer_headers):
+               pre_infer_headers,
+               sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH
+              ):
     # type: (List[str], List[str], bool) -> None
     """Initializes the transform.
 
@@ -239,11 +246,16 @@ def __init__(self,
         VCF files.
       allow_malformed_records: If true, malformed records from VCF files will be
         returned as `MalformedVcfRecord` instead of failing the pipeline.
+      pre_infer_headers: If true, drop headers and make sure PySam returns the
+        exact data for variants and calls, without type matching.
+      sample_name_encoding: Specifies how sample_name is encoded, mainly to
+        deal with the same sample_name being used across multiple VCF files.
     """
     self._input_files = input_files
     self._representative_header_lines = representative_header_lines
     self._allow_malformed_records = allow_malformed_records
     self._pre_infer_headers = pre_infer_headers
+    self._sample_name_encoding = sample_name_encoding
 
   def _read_records(self, (file_path, block)):
     # type: (Tuple[str, Block]) -> Iterable(Variant)
@@ -255,7 +267,8 @@ def _read_records(self, (file_path, block)):
         self._allow_malformed_records,
         representative_header_lines=self._representative_header_lines,
         splittable_bgzf=True,
-        pre_infer_headers=self._pre_infer_headers)
+        pre_infer_headers=self._pre_infer_headers,
+        sample_name_encoding=self._sample_name_encoding)
 
     for record in record_iterator:
       yield record
@@ -286,6 +299,7 @@ def __init__(
       validate=True,  # type: bool
       allow_malformed_records=False,  # type: bool
       pre_infer_headers=False,  # type: bool
+      sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,  # type: int
       **kwargs  # type: **str
       ):
     # type: (...) -> None
@@ -302,6 +316,10 @@ def __init__(
         underlying file_path's extension will be used to detect the compression.
       validate: flag to verify that the files exist during the pipeline creation
         time.
+      pre_infer_headers: If true, drop headers and make sure PySam returns the
+        exact data for variants and calls, without type matching.
+      sample_name_encoding: Specifies how sample_name is encoded, mainly to
+        deal with the same sample_name being used across multiple VCF files.
     """
     super(ReadFromVcf, self).__init__(**kwargs)
 
@@ -311,20 +329,23 @@ def __init__(
         compression_type,
         validate=validate,
         allow_malformed_records=allow_malformed_records,
-        pre_infer_headers=pre_infer_headers)
+        pre_infer_headers=pre_infer_headers,
+        sample_name_encoding=sample_name_encoding)
 
   def expand(self, pvalue):
     return pvalue.pipeline | Read(self._source)
 
 
 def _create_vcf_source(
     file_pattern=None, representative_header_lines=None, compression_type=None,
-    allow_malformed_records=None, pre_infer_headers=False):
+    allow_malformed_records=None, pre_infer_headers=False,
+    sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH):
   return _VcfSource(file_pattern=file_pattern,
                     representative_header_lines=representative_header_lines,
                     compression_type=compression_type,
                     allow_malformed_records=allow_malformed_records,
-                    pre_infer_headers=pre_infer_headers)
+                    pre_infer_headers=pre_infer_headers,
+                    sample_name_encoding=sample_name_encoding)
 
 
 class ReadAllFromVcf(PTransform):
@@ -348,6 +369,7 @@ def __init__(
       compression_type=CompressionTypes.AUTO,  # type: str
       allow_malformed_records=False,  # type: bool
       pre_infer_headers=False,  # type: bool
+      sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,  # type: int
       **kwargs  # type: **str
       ):
     # type: (...) -> None
@@ -366,14 +388,19 @@ def __init__(
         underlying file_path's extension will be used to detect the compression.
       allow_malformed_records: If true, malformed records from VCF files will be
         returned as :class:`MalformedVcfRecord` instead of failing the pipeline.
+      pre_infer_headers: If true, drop headers and make sure PySam returns the
+        exact data for variants and calls, without type matching.
+      sample_name_encoding: Specifies how sample_name is encoded, mainly to
+        deal with the same sample_name being used across multiple VCF files.
     """
     super(ReadAllFromVcf, self).__init__(**kwargs)
     source_from_file = partial(
         _create_vcf_source,
         representative_header_lines=representative_header_lines,
         compression_type=compression_type,
         allow_malformed_records=allow_malformed_records,
-        pre_infer_headers=pre_infer_headers)
+        pre_infer_headers=pre_infer_headers,
+        sample_name_encoding=sample_name_encoding)
     self._read_all_files = filebasedsource.ReadAllFiles(
         True,  # splittable
         CompressionTypes.AUTO, desired_bundle_size,