
Commit 5f38e93 (parent: 666fa4a)

Address first iteration of comments.

4 files changed: 21 additions, 33 deletions


gcp_variant_transforms/testing/integration/vcf_to_bq_tests/presubmit_tests/small_tests/valid_4_1_pysam.json

Lines changed: 0 additions & 23 deletions
This file was deleted.

gcp_variant_transforms/transforms/sample_info_to_bigquery.py

Lines changed: 9 additions & 4 deletions
@@ -24,10 +24,15 @@
 class ConvertSampleInfoToRow(beam.DoFn):
   """Extracts sample info from `VcfHeader` and converts it to a BigQuery row."""

-  def process(self, vcf_header, samples_span_multiple_files):
+  def __init__(self,
+               samples_span_multiple_files=False,  # type: bool
+              ):
+    self._samples_span_multiple_files = samples_span_multiple_files
+
+  def process(self, vcf_header):
     # type: (vcf_header_io.VcfHeader, bool) -> Dict[str, Union[int, str]]
     for sample in vcf_header.samples:
-      if samples_span_multiple_files:
+      if self._samples_span_multiple_files:
         sample_id = hashing_util.generate_unsigned_hash_code(
             [sample], max_hash_value=pow(2, 63))
       else:
@@ -45,7 +50,7 @@ def process(self, vcf_header, samples_span_multiple_files):
 class SampleInfoToBigQuery(beam.PTransform):
   """Writes sample info to BigQuery."""

-  def __init__(self, output_table_prefix, append=False,
+  def __init__(self, output_table_prefix, temp_location, append=False,
                samples_span_multiple_files=False):
     # type: (str, Dict[str, str], bool, bool) -> None
     """Initializes the transform.
@@ -60,7 +65,7 @@ def __init__(self, output_table_prefix, append=False,
     self._output_table = sample_info_table_schema_generator.compose_table_name(
         output_table_prefix, sample_info_table_schema_generator.TABLE_SUFFIX)
     self._append = append
-    self.samples_span_multiple_files = samples_span_multiple_files
+    self._samples_span_multiple_files = samples_span_multiple_files
     self._schema = sample_info_table_schema_generator.generate_schema()
     self._temp_location = temp_location
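For orientation, a minimal sketch (not part of this commit) of how the reworked DoFn is applied after this change: the flag is bound once at construction time, so `process` receives only the header. The `headers` PCollection and the step label here are hypothetical.

    import apache_beam as beam

    # `headers` is a hypothetical PCollection of VcfHeader objects.
    sample_rows = (
        headers
        | 'SampleInfoToRow' >> beam.ParDo(
            ConvertSampleInfoToRow(samples_span_multiple_files=True)))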

gcp_variant_transforms/vcf_to_bq.py

Lines changed: 10 additions & 5 deletions
@@ -384,7 +384,7 @@ def _run_annotation_pipeline(known_args, pipeline_args):
 def _create_sample_info_table(pipeline,  # type: beam.Pipeline
                               pipeline_mode,  # type: PipelineModes
                               known_args,  # type: argparse.Namespace,
-                              pipeline_args,  # type: List[str]
+                              temp_directory,  # str
                              ):
   # type: (...) -> None
   headers = pipeline_common.read_headers(
@@ -395,6 +395,7 @@ def _create_sample_info_table(pipeline,  # type: beam.Pipeline
   _ = (headers | 'SampleInfoToBigQuery' >>
        sample_info_to_bigquery.SampleInfoToBigQuery(
            known_args.output_table,
+           temp_directory,
            known_args.append,
            known_args.samples_span_multiple_files))
@@ -405,6 +406,8 @@ def run(argv=None):
   logging.info('Command: %s', ' '.join(argv or sys.argv))
   known_args, pipeline_args = pipeline_common.parse_args(argv,
                                                          _COMMAND_LINE_OPTIONS)
+  if known_args.output_table and '--temp_location' not in pipeline_args:
+    raise ValueError('--temp_location is required for BigQuery imports.')
   if known_args.auto_flags_experiment:
     _get_input_dimensions(known_args, pipeline_args)
@@ -480,8 +483,10 @@ def run(argv=None):
       num_partitions = 1

   if known_args.output_table:
-    options = pipeline_options.PipelineOptions(pipeline_args)
-    google_cloud_options = options.view_as(pipeline_options.GoogleCloudOptions)
+    temp_directory = pipeline_options.PipelineOptions(pipeline_args).view_as(
+        pipeline_options.GoogleCloudOptions).temp_location
+    if not temp_directory:
+      raise ValueError('--temp_location must be set when writing to BigQuery.')
     for i in range(num_partitions):
       table_suffix = ''
       if partitioner and partitioner.get_partition_name(i):
@@ -491,7 +496,7 @@ def run(argv=None):
            variant_to_bigquery.VariantToBigQuery(
                table_name,
                header_fields,
-               google_cloud_options.temp_location,
+               temp_directory,
                variant_merger,
                processed_variant_factory,
                append=known_args.append,
@@ -502,7 +507,7 @@ def run(argv=None):
                known_args.null_numeric_value_replacement)))
     if known_args.generate_sample_info_table:
       _create_sample_info_table(
-          pipeline, pipeline_mode, known_args, pipeline_args)
+          pipeline, pipeline_mode, known_args, temp_directory)

   if known_args.output_avro_path:
     # TODO(bashir2): Add an integration test that outputs to Avro files and
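The pattern introduced above, reading --temp_location out of the Beam pipeline options once and failing fast when it is absent, can be exercised on its own. A self-contained sketch, with a hypothetical bucket path:

    from apache_beam.options import pipeline_options

    pipeline_args = ['--temp_location', 'gs://my-bucket/temp']  # hypothetical
    temp_directory = pipeline_options.PipelineOptions(pipeline_args).view_as(
        pipeline_options.GoogleCloudOptions).temp_location
    if not temp_directory:
      raise ValueError('--temp_location must be set when writing to BigQuery.')
    print(temp_directory)  # prints: gs://my-bucket/temp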

setup.py

Lines changed: 2 additions & 1 deletion
@@ -42,7 +42,8 @@
     # Nucleus needs uptodate protocol buffer compiler (protoc).
     'protobuf>=3.6.1',
     'mmh3<2.6',
-    'google-cloud-storage',
+    # Refer to issue #528
+    'google-cloud-storage<1.23.0',
     'pyfarmhash',
     'pyyaml'
 ]
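A hypothetical sanity check, not part of the commit, confirming that an installed environment respects the new upper bound; it relies on pkg_resources from setuptools.

    import pkg_resources

    # The pin from setup.py: versions at or above 1.23.0 should not appear.
    installed = pkg_resources.get_distribution('google-cloud-storage').version
    assert (pkg_resources.parse_version(installed)
            < pkg_resources.parse_version('1.23.0')), installed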
