Skip to content

Commit 0216007

Browse files
committed
Address first iteration of comments.
1 parent d2ef98c commit 0216007

File tree

2 files changed

+20
-3
lines changed

2 files changed

+20
-3
lines changed

gcp_variant_transforms/vcf_to_bq.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,7 @@ def _run_annotation_pipeline(known_args, pipeline_args):
389389
def _create_sample_info_table(pipeline, # type: beam.Pipeline
390390
pipeline_mode, # type: PipelineModes
391391
known_args, # type: argparse.Namespace
392-
pipeline_args, # type: List[str]
392+
temp_directory, # type: str
393393
):
394394
# type: (...) -> None
395395
headers = pipeline_common.read_headers(
@@ -399,8 +399,14 @@ def _create_sample_info_table(pipeline, # type: beam.Pipeline
399399
_ = (headers | 'SampleInfoToBigQuery' >>
400400
sample_info_to_bigquery.SampleInfoToBigQuery(
401401
known_args.output_table,
402+
<<<<<<< HEAD
402403
SampleNameEncoding[known_args.sample_name_encoding],
403404
known_args.append))
405+
=======
406+
temp_directory,
407+
known_args.append,
408+
known_args.samples_span_multiple_files))
409+
>>>>>>> Address first iteration of comments.
404410

405411

406412
def run(argv=None):
@@ -409,6 +415,8 @@ def run(argv=None):
409415
logging.info('Command: %s', ' '.join(argv or sys.argv))
410416
known_args, pipeline_args = pipeline_common.parse_args(argv,
411417
_COMMAND_LINE_OPTIONS)
418+
if known_args.output_table and '--temp_location' not in pipeline_args:
419+
raise ValueError('--temp_location is required for BigQuery imports.')
412420
if known_args.auto_flags_experiment:
413421
_get_input_dimensions(known_args, pipeline_args)
414422

@@ -484,6 +492,7 @@ def run(argv=None):
484492
num_shards = 1
485493

486494
if known_args.output_table:
495+
<<<<<<< HEAD
487496
schema_file = tempfile.mkstemp(prefix=known_args.output_table,
488497
suffix=_BQ_SCHEMA_FILE_SUFFIX)[1]
489498
schema = (
@@ -495,6 +504,13 @@ def run(argv=None):
495504
file_to_write.write(schema_json)
496505

497506
for i in range(num_shards):
507+
=======
508+
temp_directory = pipeline_options.PipelineOptions(pipeline_args).view_as(
509+
pipeline_options.GoogleCloudOptions).temp_location
510+
if not temp_directory:
511+
raise ValueError('--temp_location must be set when writing to BigQuery.')
512+
for i in range(num_partitions):
513+
>>>>>>> Address first iteration of comments.
498514
table_suffix = ''
499515
if sharding and sharding.get_shard_name(i):
500516
table_suffix = '_' + sharding.get_shard_name(i)
@@ -511,7 +527,7 @@ def run(argv=None):
511527
known_args.null_numeric_value_replacement)))
512528
if known_args.generate_sample_info_table:
513529
_create_sample_info_table(
514-
pipeline, pipeline_mode, known_args, pipeline_args)
530+
pipeline, pipeline_mode, known_args, temp_directory)
515531

516532
if known_args.output_avro_path:
517533
# TODO(bashir2): Add an integration test that outputs to Avro files and

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,8 @@
3838
'google-api-python-client>=1.6',
3939
'intervaltree>=2.1.0,<2.2.0',
4040
'mmh3<2.6',
41-
'google-cloud-storage',
41+
# Refer to issue #528
42+
'google-cloud-storage<1.23.0',
4243
'pyfarmhash',
4344
'pyyaml'
4445
]

0 commit comments

Comments
 (0)