Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 56 additions & 44 deletions rules/pre-mehari/snakefiles/scripts/clinvar-to-tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def print_for_postgres(self, file: typing.TextIO = sys.stdout):
}
PATHOGENICITIES_INV = {v: k for k, v in PATHOGENICITIES.items()}
REVIEW_STATUS_LABELS: dict[clinvar_public.AggregateGermlineReviewStatus.ValueType, str] = {
clinvar_public.AggregateGermlineReviewStatus.AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_CONFLICTING_CLASSIFICATIONS: "criteria provided, conflicting classifications ",
clinvar_public.AggregateGermlineReviewStatus.AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_CONFLICTING_CLASSIFICATIONS: "criteria provided, conflicting classifications",
clinvar_public.AggregateGermlineReviewStatus.AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_MULTIPLE_SUBMITTERS_NO_CONFLICTS: "criteria provided, multiple submitters, no conflicts",
clinvar_public.AggregateGermlineReviewStatus.AGGREGATE_GERMLINE_REVIEW_STATUS_CRITERIA_PROVIDED_SINGLE_SUBMITTER: "criteria provided, single submitter",
clinvar_public.AggregateGermlineReviewStatus.AGGREGATE_GERMLINE_REVIEW_STATUS_NO_ASSERTION_CRITERIA_PROVIDED: "no assertion criteria provided",
Expand Down Expand Up @@ -137,46 +137,26 @@ def extracted_var_to_clinvar_record(
variation_type = "indel"
symbols = list(filter(lambda x: bool(x), map(hgnc_map.get, record.hgnc_ids)))

if "conflicting" in germline_classification.description.lower():
summary_clinvar_pathogenicity = ["uncertain significance"]
summary_clinvar_pathogenicity_label = "conflicting interpretations of pathogenicity"
summary_clinvar_review_status_label = "criteria provided, conflicting classifications"
summary_clinvar_gold_stars = 0
else:
description_tokens = (
token.strip().lower()
for token in re.split(r"[/,;]", germline_classification.description)
)
summary_clinvar_pathogenicity = []
summary_clinvar_pathogenicity_label = "uncertain significance"
for token in description_tokens:
for key in PATHOGENICITIES.keys():
if key == token:
summary_clinvar_pathogenicity.append(key)
if summary_clinvar_pathogenicity:
summary_clinvar_pathogenicity_label = "/".join(summary_clinvar_pathogenicity)
else:
summary_clinvar_pathogenicity = ["uncertain significance"]
summary_clinvar_pathogenicity_label = "uncertain significance"
summary_clinvar_review_status_label = REVIEW_STATUS_LABELS[
germline_classification.review_status
]
summary_clinvar_gold_stars = REVIEW_STATUS_STARS[germline_classification.review_status]
# Summary ClinVar fields are discontinued
summary_clinvar_pathogenicity = []
summary_clinvar_pathogenicity_label = ""
summary_clinvar_review_status_label = ""
summary_clinvar_gold_stars = 0
Comment on lines +140 to +144
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

# Search for all usages of summary_clinvar fields in Python files
rg -n "summary_clinvar" --type py

Repository: varfish-org/varfish-db-downloader

Length of output: 2247


🏁 Script executed:

# Read the context around the initialization lines (140-144)
head -n 200 rules/pre-mehari/snakefiles/scripts/clinvar-to-tsv.py | tail -n +120

Repository: varfish-org/varfish-db-downloader

Length of output: 4055


🏁 Script executed:

# Check the TSV output headers to see what fields are included
sed -n '240,270p' rules/pre-mehari/snakefiles/scripts/clinvar-to-tsv.py

Repository: varfish-org/varfish-db-downloader

Length of output: 797


🏁 Script executed:

# Check if there's any documentation or changelog about this change
rg -n "paranoid\|discontinued" --type py --type md --type txt -i

Repository: varfish-org/varfish-db-downloader

Length of output: 59


The deprecation of the summary_clinvar_* fields is intentional, but the transition should be documented.

The discontinuation of the summary_clinvar_* fields (set to empty values at lines 140-144) is an intentional refactoring in favor of the summary_paranoid_* fields, which are calculated using improved classification logic. Since both the old and the new fields are preserved in the TSV output, the column layout stays backward compatible, although the legacy columns no longer carry meaningful values. Add a clear comment or documentation note explaining that downstream consumers should migrate to the new summary_paranoid_* fields, because the legacy fields will remain empty.

🤖 Prompt for AI Agents
rules/pre-mehari/snakefiles/scripts/clinvar-to-tsv.py around lines 140-144: the
summary_clinvar_* variables are intentionally set to empty for a migration to
summary_paranoid_* logic; add a short inline comment above these assignments
stating that the legacy summary_clinvar_* fields are deprecated and
intentionally left blank, that downstream consumers should migrate to
summary_paranoid_* fields (which are the new authoritative values), and that
both old and new columns remain in the TSV for backward compatibility; also add
a brief note to the TSV output generation (or repository README) indicating the
deprecation and recommended migration target so users know to stop relying on
the empty legacy fields.


summary_paranoid_review_status_label = summary_clinvar_review_status_label
summary_paranoid_pathogenicity_label = summary_clinvar_pathogenicity_label
summary_paranoid_pathogenicity = summary_clinvar_pathogenicity
summary_paranoid_gold_stars = summary_clinvar_gold_stars
if "conflicting" in summary_clinvar_pathogenicity_label:
# look through the SCVs
worst = None
# Process paranoid fields from actual ClinVar data
if "conflicting" in germline_classification.description.lower():
# look through the SCVs to collect all contributing pathogenicity values
contributing_pathogenicities = set()
for clinical_assertion in record.clinical_assertions:
if clinical_assertion.HasField(
"classifications"
) and clinical_assertion.classifications.HasField("germline_classification"):
if clinical_assertion.classifications.review_status not in (
clinvar_public.AggregateGermlineReviewStatus.AGGREGATE_GERMLINE_REVIEW_STATUS_NO_CLASSIFICATION_PROVIDED,
# clinvar_public.AggregateGermlineReviewStatus.AGGREGATE_GERMLINE_REVIEW_STATUS_NO_ASSERTION_CRITERIA_PROVIDED,
clinvar_public.SubmitterReviewStatus.SUBMITTER_REVIEW_STATUS_UNSPECIFIED,
clinvar_public.SubmitterReviewStatus.SUBMITTER_REVIEW_STATUS_NO_CLASSIFICATION_PROVIDED,
clinvar_public.SubmitterReviewStatus.SUBMITTER_REVIEW_STATUS_NO_ASSERTION_CRITERIA_PROVIDED,
clinvar_public.SubmitterReviewStatus.SUBMITTER_REVIEW_STATUS_FLAGGED_SUBMISSION,
clinvar_public.SubmitterReviewStatus.SUBMITTER_REVIEW_STATUS_NOT_CLASSIFIED_BY_SUBMITTER,
):
description_tokens = [
token.strip().lower()
Expand All @@ -186,15 +166,47 @@ def extracted_var_to_clinvar_record(
]
for token in description_tokens:
for key in PATHOGENICITIES.keys():
if PATHOGENICITIES[key] is not None and (
worst is None or PATHOGENICITIES[key] > worst
):
worst = PATHOGENICITIES[key]
if worst is not None and PATHOGENICITIES.get(summary_clinvar_pathogenicity[0], 0) < worst:
# override paranoid
summary_paranoid_pathogenicity_label = PATHOGENICITIES_INV[worst]
summary_paranoid_pathogenicity = [PATHOGENICITIES_INV[worst]]
summary_paranoid_gold_stars = 0
if key == token:
contributing_pathogenicities.add(key)

if contributing_pathogenicities:
# Sort by pathogenicity value (most pathogenic first)
sorted_pathogenicities = sorted(
contributing_pathogenicities,
key=lambda x: PATHOGENICITIES.get(x, 0),
reverse=True
)
summary_paranoid_pathogenicity = sorted_pathogenicities
summary_paranoid_pathogenicity_label = "/".join(sorted_pathogenicities)
else:
# No valid contributing pathogenicities found, fall back to uncertain significance
summary_paranoid_pathogenicity = ["uncertain significance"]
summary_paranoid_pathogenicity_label = "uncertain significance"
summary_paranoid_review_status_label = REVIEW_STATUS_LABELS[
germline_classification.review_status
]
summary_paranoid_gold_stars = REVIEW_STATUS_STARS[germline_classification.review_status]
else:
# Non-conflicting case: parse the germline classification description
description_tokens = (
token.strip().lower()
for token in re.split(r"[/,;]", germline_classification.description)
)
summary_paranoid_pathogenicity = []
summary_paranoid_pathogenicity_label = "uncertain significance"
for token in description_tokens:
for key in PATHOGENICITIES.keys():
if key == token:
summary_paranoid_pathogenicity.append(key)
if summary_paranoid_pathogenicity:
summary_paranoid_pathogenicity_label = "/".join(summary_paranoid_pathogenicity)
else:
summary_paranoid_pathogenicity = ["uncertain significance"]
summary_paranoid_pathogenicity_label = "uncertain significance"
summary_paranoid_review_status_label = REVIEW_STATUS_LABELS[
germline_classification.review_status
]
summary_paranoid_gold_stars = REVIEW_STATUS_STARS[germline_classification.review_status]

return Clinvar(
release=release,
Expand Down
Loading