Commit 8049c19

Enforce region (#527)
* Enforce providing --region flag instead of --zone. Also adding --region flag to all of our integration tests. Remove zone from all integration json files.
* First round of comments
* Second round of review
1 parent d993334 commit 8049c19

40 files changed: +111 -108 lines changed

README.md

Lines changed: 10 additions & 12 deletions

@@ -53,6 +53,9 @@ Run the script below and replace the following parameters:
 
 * `GOOGLE_CLOUD_PROJECT`: This is your project ID that contains the BigQuery
   dataset.
+* `GOOGLE_CLOUD_REGION`: You must choose a geographic region for Cloud Dataflow
+  to process your data, for example: `us-west1`. For more info about regions
+  please refer to [Setting Regions](docs/setting_region.md).
 * `INPUT_PATTERN`: A location in Google Cloud Storage where the
   VCF file are stored. You may specify a single file or provide a pattern to
   load multiple files at once. Please refer to the

@@ -69,6 +72,7 @@ Run the script below and replace the following parameters:
 #!/bin/bash
 # Parameters to replace:
 GOOGLE_CLOUD_PROJECT=GOOGLE_CLOUD_PROJECT
+GOOGLE_CLOUD_REGION=GOOGLE_CLOUD_REGION
 INPUT_PATTERN=gs://BUCKET/*.vcf
 OUTPUT_TABLE=GOOGLE_CLOUD_PROJECT:BIGQUERY_DATASET.BIGQUERY_TABLE
 TEMP_LOCATION=gs://BUCKET/temp

@@ -83,15 +87,15 @@ COMMAND="vcf_to_bq \
 docker run -v ~/.config:/root/.config \
   gcr.io/cloud-lifesciences/gcp-variant-transforms \
   --project "${GOOGLE_CLOUD_PROJECT}" \
-  --zones us-west1-b \
+  --region "${GOOGLE_CLOUD_REGION}" \
   "${COMMAND}"
 ```
-The flags `--project` and `--zones` are optional, given that these properties
-are set in your local configuration. You may set the default project and zones
-using the following commands:
+Both `--project` and `--region` flags are needed unless their default values
+are set in your local `gcloud` configuration. You may set the default project
+and region using the following commands:
 ```bash
 gcloud config set project GOOGLE_CLOUD_PROJECT
-gcloud config set compute/zone ZONE
+gcloud config set compute/region REGION
 ```

@@ -143,13 +147,13 @@ python -m gcp_variant_transforms.vcf_to_bq \
   --input_pattern gs://BUCKET/*.vcf \
   --output_table GOOGLE_CLOUD_PROJECT:BIGQUERY_DATASET.BIGQUERY_TABLE \
   --project "${GOOGLE_CLOUD_PROJECT}" \
+  --region "${GOOGLE_CLOUD_REGION}" \
   --temp_location gs://BUCKET/temp \
   --job_name vcf-to-bigquery \
   --setup_file ./setup.py \
   --runner DataflowRunner
 ```
 
-
 ## Running VCF files preprocessor
 
 The VCF files preprocessor is used for validating the datasets such that the

@@ -165,12 +169,6 @@ The BigQuery to VCF pipeline is used to export variants in BigQuery to one VCF f
 Please refer to [BigQuery to VCF pipeline](docs/bigquery_to_vcf.md) for more
 details.
 
-## Running jobs in a particular region/zone
-
-You may need to constrain Cloud Dataflow job processing to a specific geographic
-region in support of your project’s security and compliance needs. See
-[Setting zone/region doc](docs/setting_zone_region.md).
-
 
 ## Additional topics
 
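Since both flags can now fall back to `gcloud` defaults, it may help to confirm what is configured locally. These are the same `gcloud config get-value` calls the runner script uses to resolve its defaults (see `docker/pipelines_runner.sh` below):

```bash
# Print the locally configured defaults; empty output means the
# corresponding flag must be passed explicitly.
gcloud config get-value project
gcloud config get-value compute/region
```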

docker/pipelines_runner.sh

Lines changed: 9 additions & 9 deletions

@@ -22,7 +22,7 @@ set -euo pipefail
 #################################################
 function parse_args {
   # getopt command is only for checking arguments.
-  getopt -o '' -l project:,temp_location:,docker_image:,zones: -- "$@"
+  getopt -o '' -l project:,temp_location:,docker_image:,region: -- "$@"
   while [[ "$#" -gt 0 ]]; do
     case "$1" in
       --project)

@@ -37,8 +37,8 @@ function parse_args {
         vt_docker_image="$2"
         ;;
 
-      --zones)
-        zones="$2"
+      --region)
+        region="$2"
         ;;
 
       *)

@@ -58,7 +58,7 @@ function main {
 
   google_cloud_project="${google_cloud_project:-$(gcloud config get-value project)}"
   vt_docker_image="${vt_docker_image:-gcr.io/cloud-lifesciences/gcp-variant-transforms:${COMMIT_SHA}}"
-  zones="${zones:-$(gcloud config get-value compute/zone)}"
+  region="${region:-$(gcloud config get-value compute/region)}"
   temp_location="${temp_location:-''}"
 
   if [[ -z "${google_cloud_project}" ]]; then

@@ -67,9 +67,9 @@ function main {
     exit 1
   fi
 
-  if [[ -z "${zones}" ]]; then
-    echo "Please set the zones using flags --zones."
-    echo "Or set default zone in your local client configuration using gcloud config set compute/zone ZONE."
+  if [[ -z "${region}" ]]; then
+    echo "Please set the region using flags --region."
+    echo "Or set default region in your local client configuration using gcloud config set compute/region REGION."
     exit 1
   fi
 

@@ -79,11 +79,11 @@ function main {
   fi
 
   pipelines --project "${google_cloud_project}" run \
-    --command "/opt/gcp_variant_transforms/bin/${command} --project ${google_cloud_project}" \
+    --command "/opt/gcp_variant_transforms/bin/${command} --project ${google_cloud_project} --region ${region}" \
     --output "${temp_location}"/runner_logs_$(date +%Y%m%d_%H%M%S).log \
     --wait \
    --scopes "https://www.googleapis.com/auth/cloud-platform" \
-    --zones "${zones}" \
+    --regions "${region}" \
     --image "${vt_docker_image}" \
     --pvm-attempts 0 \
     --attempts 1 \
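For reference, a hedged sketch of how the updated runner might be invoked after this change. The flag names come from the `getopt` line above; all values are placeholders, and how the pipeline command itself (the `${command}` variable used in `main`) is supplied is not shown in this diff:

```bash
# Hypothetical invocation; flag names are from the getopt line above,
# all values are placeholders. Flags left unset fall back to the local
# gcloud configuration (see the defaults in main).
./docker/pipelines_runner.sh \
  --project my-project \
  --region us-west1 \
  --temp_location gs://my-bucket/temp \
  --docker_image gcr.io/cloud-lifesciences/gcp-variant-transforms
```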

docs/bigquery_to_vcf.md

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ COMMAND="bq_to_vcf \
 docker run -v ~/.config:/root/.config \
   gcr.io/cloud-lifesciences/gcp-variant-transforms \
   --project "${GOOGLE_CLOUD_PROJECT}" \
-  --zones us-west1-b \
+  --region us-west1 \
   "${COMMAND}"
 ```
 

docs/setting_region.md

Lines changed: 70 additions & 0 deletions

@@ -0,0 +1,70 @@
+# Setting GCP region
+
+## What to consider
+
+Google Cloud Platform services are available in [many
+locations](https://cloud.google.com/about/locations/) across the globe.
+You can minimize network latency and network transport costs by running your
+Dataflow job in the same region where its input bucket, output dataset, and
+temporary directory are located. More specifically, in order to run Variant
+Transforms most efficiently you should make sure all the following resources
+are located in the same region:
+* Your source bucket, set by the `--input_pattern` flag.
+* Your pipeline's temporary location, set by the `--temp_location` flag.
+* Your output BigQuery dataset, set by the `--output_table` flag.
+* Your Dataflow pipeline, set by the `--region` flag.
+
+## Running jobs in a particular region
+
+The Dataflow API [requires](https://beam.apache.org/blog/2019/08/22/beam-2.15.0.html)
+setting a [GCP
+region](https://cloud.google.com/compute/docs/regions-zones/#available) via
+the `--region` flag to run. In addition to this requirement, you might also
+choose to run Variant Transforms in a specific region to meet your project's
+security and compliance requirements. For example, in order
+to restrict your processing job to Europe, update the region as follows:
+
+```bash
+COMMAND="/opt/gcp_variant_transforms/bin/vcf_to_bq ...
+
+docker run gcr.io/cloud-lifesciences/gcp-variant-transforms \
+  --project "${GOOGLE_CLOUD_PROJECT}" \
+  --region "${GOOGLE_CLOUD_REGION}" \
+  "${COMMAND}"
+```
+
+Note that the values of the `--project` and `--region` flags will be automatically
+passed as `COMMAND` args in [`pipelines_runner.sh`](docker/pipelines_runner.sh).
+Alternatively, you can set your default region using the following command:
+
+```bash
+gcloud config set compute/region "europe-west1"
+```
+
+In this case you do not need to set the `--region` flag any more. For more
+information please refer to this [Cloud SDK page](https://cloud.google.com/sdk/gcloud/reference/config/set).
+
+If you are running Variant Transforms from GitHub, you just need to specify
+the region for the Dataflow API as below:
+
+```bash
+python -m gcp_variant_transforms.vcf_to_bq ... \
+  --project "${GOOGLE_CLOUD_PROJECT}" \
+  --region "${GOOGLE_CLOUD_REGION}" \
+```
+
+## Setting Google Cloud Storage bucket region
+
+You can choose your [GCS bucket's region](https://cloud.google.com/storage/docs/locations)
+when you are [creating it](https://cloud.google.com/storage/docs/creating-buckets#storage-create-bucket-console).
+When you create a bucket, you [permanently
+define](https://cloud.google.com/storage/docs/moving-buckets#storage-create-bucket-console)
+its name, its geographic location, and the project it is part of. For an
+existing bucket, you can check
+[its information](https://cloud.google.com/storage/docs/getting-bucket-information)
+to find out about its geographic location.
+
+## Setting BigQuery dataset region
+
+You can choose the region for the BigQuery dataset at dataset creation time.
+
+![BigQuery dataset region](images/bigquery_dataset_region.png)
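As a hedged companion to the new doc's GCS section, the bucket region can also be pinned at creation time with `gsutil` (the bucket name is a placeholder, not from this commit):

```bash
# Create a bucket in a specific region; as the doc notes, the location
# is fixed permanently at creation time. Bucket name is a placeholder.
gsutil mb -l europe-west1 gs://my-variant-bucket
```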

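Likewise, a hedged sketch of creating the BigQuery dataset in a chosen region with the `bq` CLI, mirroring the console screenshot referenced above (project and dataset names are placeholders):

```bash
# Create a dataset pinned to a region; project and dataset names are
# placeholders. A dataset's location cannot be changed after creation.
bq --location=europe-west1 mk --dataset my-project:my_dataset
```
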
docs/setting_zone_region.md

Lines changed: 0 additions & 34 deletions
This file was deleted.

docs/troubleshooting.md

Lines changed: 10 additions & 6 deletions

@@ -14,13 +14,17 @@ group or file a GitHub issue if you believe that there is a bug in the pipeline.
   [predefined machine types](https://cloud.google.com/compute/pricing#predefined_machine_types)
   for the full list.
 * Ensure you have enough [quota](https://cloud.google.com/compute/quotas) in the
-  zone/region running the pipeline. By default, the pipeline runs in the
-  `us-central1` region. You may change this by specifying `--region <region>`
-  or `--zone <zone>` when running the pipeline. You can check for quota issues
-  by navigating to the
-  [Compute Engine quotas page](https://console.cloud.google.com/iam-admin/quotas?service=compute.googleapis.com)
+  region running the pipeline. You need to [set a region](./setting_region.md)
+  for running the pipeline by specifying `--region <region>`. You can check for
+  quota issues by navigating to the [Compute Engine quotas page](https://console.cloud.google.com/iam-admin/quotas?service=compute.googleapis.com)
   while the pipeline is running, which shows saturated quotas at the top of the
-  page.
+  page (highlighted in red).
+* Ensure your source GCS bucket is located in the same region as where you are
+  running your Dataflow pipeline. According to [data
+  locality](https://cloud.google.com/dataflow/docs/concepts/regional-endpoints#data_locality)
+  guidelines the GCS bucket containing your VCF files as well as the temporary
+  directory of your pipeline should be located in the same region as your
+  Dataflow pipeline.
 * `gzip` and `bzip2` file formats cannot be sharded, which considerably slows
   down the pipeline. Consider decompressing the files prior to running the
   pipeline. You may use [dsub](https://github.com/googlegenomics/dsub) to write
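The bucket-region check described above can also be done from the command line; a hedged example with `gsutil` (bucket name is a placeholder):

```bash
# Print bucket metadata; the "Location constraint" field reports the
# bucket's region. Bucket name is a placeholder.
gsutil ls -L -b gs://my-variant-bucket
```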

docs/vcf_files_preprocessor.md

Lines changed: 1 addition & 1 deletion

@@ -83,7 +83,7 @@ COMMAND="vcf_to_bq_preprocess \
 docker run -v ~/.config:/root/.config \
   gcr.io/cloud-lifesciences/gcp-variant-transforms \
   --project "${GOOGLE_CLOUD_PROJECT}" \
-  --zones us-west1-b \
+  --region us-west1 \
   "${COMMAND}"
 ```
 

gcp_variant_transforms/testing/integration/bq_to_vcf_tests/no_options.json

Lines changed: 0 additions & 1 deletion

@@ -4,7 +4,6 @@
     "input_table": "gcp-variant-transforms-test:bq_to_vcf_integration_tests.4_0",
     "output_file_name": "bq_to_vcf_no_options.vcf",
     "runner": "DirectRunner",
-    "zones": ["us-west1-b"],
     "expected_output_file": "gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/no_options.vcf"
   }
 ]

gcp_variant_transforms/testing/integration/bq_to_vcf_tests/option_allow_incompatible_schema.json

Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@
     "output_file_name": "bq_to_vcf_option_allow_incompatible_schema.vcf",
     "allow_incompatible_schema": true,
     "runner": "DirectRunner",
-    "zones": ["us-west1-b"],
     "expected_output_file": "gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_allow_incompatible_schema.vcf"
   }
 ]

gcp_variant_transforms/testing/integration/bq_to_vcf_tests/option_customized_export.json

Lines changed: 0 additions & 1 deletion

@@ -6,7 +6,6 @@
     "genomic_regions": "19:1234566-1234570 20:14369-17330",
     "call_names": "NA00001 NA00003",
     "runner": "DirectRunner",
-    "zones": ["us-west1-b"],
     "expected_output_file": "gcp_variant_transforms/testing/data/vcf/bq_to_vcf/expected_output/option_customized_export.vcf"
   }
 ]
