Skip to content

Commit d993334

Browse files
authored
Add a flag to include file in the sample_id hash (#526)
* Add a flag to include file in the sample_id hash
* Rename flag to samples_span_multiple_files
1 parent be516c7 commit d993334

File tree

4 files changed

+58
-12
lines changed

4 files changed

+58
-12
lines changed

gcp_variant_transforms/options/variant_transform_options.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,13 @@ def add_arguments(self, parser):
155155
'[EXPERIMENTAL]'
156156
).format(sample_info_table_schema_generator.TABLE_SUFFIX))
157157

158+
parser.add_argument(
159+
'--samples_span_multiple_files',
160+
type='bool', default=True, nargs='?', const=True,
161+
help=('If True sample_id will be the hash of [sample_name] thus it '
162+
'will be independent of file_path, otherwise hash of '
163+
'[file_path, sample_name] will be used as sample_id. '))
164+
158165
parser.add_argument(
159166
'--split_alternate_allele_info_fields',
160167
type='bool', default=True, nargs='?', const=True,

gcp_variant_transforms/transforms/sample_info_to_bigquery.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,16 @@
2424
class ConvertSampleInfoToRow(beam.DoFn):
2525
"""Extracts sample info from `VcfHeader` and converts it to a BigQuery row."""
2626

27-
def process(self, vcf_header):
28-
# type: (vcf_header_io.VcfHeader) -> Dict[str, Union[int, str]]
27+
def process(self, vcf_header, samples_span_multiple_files):
28+
# type: (vcf_header_io.VcfHeader, bool) -> Dict[str, Union[int, str]]
2929
for sample in vcf_header.samples:
30-
sample_id = hashing_util.generate_unsigned_hash_code(
31-
[vcf_header.file_path, sample], max_hash_value=pow(2, 63))
30+
if samples_span_multiple_files:
31+
sample_id = hashing_util.generate_unsigned_hash_code(
32+
[sample], max_hash_value=pow(2, 63))
33+
else:
34+
sample_id = hashing_util.generate_unsigned_hash_code(
35+
[vcf_header.file_path, sample], max_hash_value=pow(2, 63))
36+
3237
row = {
3338
sample_info_table_schema_generator.SAMPLE_ID: sample_id,
3439
sample_info_table_schema_generator.SAMPLE_NAME: sample,
@@ -40,24 +45,28 @@ def process(self, vcf_header):
4045
class SampleInfoToBigQuery(beam.PTransform):
4146
"""Writes sample info to BigQuery."""
4247

43-
def __init__(self, output_table_prefix, append=False):
44-
# type: (str, Dict[str, str], bool) -> None
48+
def __init__(self, output_table_prefix, append=False,
49+
samples_span_multiple_files=False):
50+
# type: (str, Dict[str, str], bool, bool) -> None
4551
"""Initializes the transform.
4652
4753
Args:
4854
output_table_prefix: The prefix of the output BigQuery table.
4955
append: If true, existing records in output_table will not be
5056
overwritten. New records will be appended to those that already exist.
57+
samples_span_multiple_files: If true, sample_id = hash#([sample_name]),
58+
otherwise sample_id = hash#([file_path, sample_name]).
5159
"""
5260
self._output_table = sample_info_table_schema_generator.compose_table_name(
5361
output_table_prefix, sample_info_table_schema_generator.TABLE_SUFFIX)
5462
self._append = append
63+
self.samples_span_multiple_files = samples_span_multiple_files
5564
self._schema = sample_info_table_schema_generator.generate_schema()
5665

5766
def expand(self, pcoll):
5867
return (pcoll
5968
| 'ConvertSampleInfoToBigQueryTableRow' >> beam.ParDo(
60-
ConvertSampleInfoToRow())
69+
ConvertSampleInfoToRow(self._samples_span_multiple_files))
6170
| 'WriteSampleInfoToBigQuery' >> beam.io.WriteToBigQuery(
6271
self._output_table,
6372
schema=self._schema,

gcp_variant_transforms/transforms/sample_info_to_bigquery_test.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@ def test_convert_sample_info_to_row(self):
3333
file_path='file_1')
3434
vcf_header_2 = vcf_header_io.VcfHeader(samples=['Sample 1', 'Sample 2'],
3535
file_path='file_2')
36-
file_path_to_file_hash = {'file_1': 'hash_1',
37-
'file_2': 'hash_2'}
3836
expected_rows = [
3937
{sample_info_table_schema_generator.SAMPLE_ID: 5961690698012655974,
4038
sample_info_table_schema_generator.SAMPLE_NAME: 'Sample 1',
@@ -55,7 +53,37 @@ def test_convert_sample_info_to_row(self):
5553
| transforms.Create([vcf_header_1, vcf_header_2])
5654
| 'ConvertToRow'
5755
>> transforms.ParDo(sample_info_to_bigquery.ConvertSampleInfoToRow(
58-
file_path_to_file_hash)))
56+
), False))
57+
58+
assert_that(bigquery_rows, equal_to(expected_rows))
59+
pipeline.run()
60+
61+
def test_convert_sample_info_to_row_without_file_in_hash(self):
62+
vcf_header_1 = vcf_header_io.VcfHeader(samples=['Sample 1', 'Sample 2'],
63+
file_path='file_1')
64+
vcf_header_2 = vcf_header_io.VcfHeader(samples=['Sample 1', 'Sample 2'],
65+
file_path='file_2')
66+
expected_rows = [
67+
{sample_info_table_schema_generator.SAMPLE_ID: 6721344017406412066,
68+
sample_info_table_schema_generator.SAMPLE_NAME: 'Sample 1',
69+
sample_info_table_schema_generator.FILE_PATH: 'file_1'},
70+
{sample_info_table_schema_generator.SAMPLE_ID: 7224630242958043176,
71+
sample_info_table_schema_generator.SAMPLE_NAME: 'Sample 2',
72+
sample_info_table_schema_generator.FILE_PATH: 'file_1'},
73+
{sample_info_table_schema_generator.SAMPLE_ID: 6721344017406412066,
74+
sample_info_table_schema_generator.SAMPLE_NAME: 'Sample 1',
75+
sample_info_table_schema_generator.FILE_PATH: 'file_2'},
76+
{sample_info_table_schema_generator.SAMPLE_ID: 7224630242958043176,
77+
sample_info_table_schema_generator.SAMPLE_NAME: 'Sample 2',
78+
sample_info_table_schema_generator.FILE_PATH: 'file_2'}
79+
]
80+
pipeline = test_pipeline.TestPipeline()
81+
bigquery_rows = (
82+
pipeline
83+
| transforms.Create([vcf_header_1, vcf_header_2])
84+
| 'ConvertToRow'
85+
>> transforms.ParDo(sample_info_to_bigquery.ConvertSampleInfoToRow(
86+
), True))
5987

6088
assert_that(bigquery_rows, equal_to(expected_rows))
6189
pipeline.run()

gcp_variant_transforms/vcf_to_bq.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -388,8 +388,10 @@ def _create_sample_info_table(pipeline, # type: beam.Pipeline
388388
pipeline_mode,
389389
known_args.all_patterns)
390390
_ = (headers | 'SampleInfoToBigQuery' >>
391-
sample_info_to_bigquery.SampleInfoToBigQuery(known_args.output_table,
392-
known_args.append))
391+
sample_info_to_bigquery.SampleInfoToBigQuery(
392+
known_args.output_table,
393+
known_args.append,
394+
known_args.samples_span_multiple_files))
393395

394396

395397
def run(argv=None):

0 commit comments

Comments
 (0)