From 905e894e026d0ac37de42cbc31eae9f9b76a9e5f Mon Sep 17 00:00:00 2001
From: sujata-m <sujatam@gaussiansolutions.com>
Date: Mon, 15 Dec 2025 22:17:36 +0530
Subject: [PATCH] [0.3.2] Updated unit test cases

- Reject non-OSW GeoJSON files (anything beyond points/nodes/edges/lines/polygons/zones) with a clear error; added unit coverage for unsupported files and extension validation paths.
- Clarified extension handling expectations: external extension ZIPs are now considered invalid inputs in tests.
- Added a detailed scenario summary (`unit_test_overview.md`) and renamed log-oriented tests for clarity; removed noisy traceback printing on unexpected validation errors.
---
 CHANGELOG.md                                  |   7 +-
 .../extracted_data_validator.py               |  10 ++
 src/python_osw_validation/version.py          |   2 +-
 .../test_extracted_data_validator.py          |  10 ++
 tests/unit_tests/test_osw_validation.py       |  12 +-
 .../unit_tests/test_osw_validation_extras.py  | 150 ++++++++++++++++--
 unit_test_overview.md                         |  78 +++++++++
 7 files changed, 248 insertions(+), 21 deletions(-)
 create mode 100644 unit_test_overview.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6787619..6325fdc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,11 @@
 # Change log
 
-### 0.3.1 - 2025-02-12
+### 0.3.2 - 2025-12-15
+- Reject non-OSW GeoJSON files (anything beyond points/nodes/edges/lines/polygons/zones) with a clear error; added unit coverage for unsupported files and extension validation paths.
+- Clarified extension handling expectations: external extension ZIPs are now considered invalid inputs in tests.
+- Added a detailed scenario summary (`unit_test_overview.md`) and renamed log-oriented tests for clarity; removed noisy traceback printing on unexpected validation errors.
+
+### 0.3.1 - 2025-12-15
 - Restored custom field support across schemas (CustomNode/Edge/Zone) while aligning required fields with original behavior.
 - Relaxed edges/zones global required lists to let custom features omit `highway` (kept in branch-specific schemas).
 - Enhanced 0.2 compatibility guard: allow `ext:` on nodes, block custom content per dataset with specific messages (e.g., Custom Edge/Point/Polygon), and reject non-point `ext:`.
diff --git a/src/python_osw_validation/extracted_data_validator.py b/src/python_osw_validation/extracted_data_validator.py
index 901205f..52abdf1 100644
--- a/src/python_osw_validation/extracted_data_validator.py
+++ b/src/python_osw_validation/extracted_data_validator.py
@@ -98,6 +98,16 @@ def is_valid(self) -> bool:
             gc.collect()
             return True
 
+        allowed_keys = tuple(OSW_DATASET_FILES.keys())
+        unsupported_files = sorted(
+            {bn for bn in basenames if not any(key in bn for key in allowed_keys)}
+        )
+        if unsupported_files:
+            allowed_fmt = ", ".join(allowed_keys)
+            self.error = (f"Unsupported .geojson files present: {', '.join(unsupported_files)}. "
+                          f"Allowed file types are {{{allowed_fmt}}}")
+            return False
+
         required_files = [key for key, value in OSW_DATASET_FILES.items() if value['required']]
         optional_files = [key for key, value in OSW_DATASET_FILES.items() if not value['required']]
         missing_files = []
diff --git a/src/python_osw_validation/version.py b/src/python_osw_validation/version.py
index e1424ed..73e3bb4 100644
--- a/src/python_osw_validation/version.py
+++ b/src/python_osw_validation/version.py
@@ -1 +1 @@
-__version__ = '0.3.1'
+__version__ = '0.3.2'
diff --git a/tests/unit_tests/test_extracted_data_validator.py b/tests/unit_tests/test_extracted_data_validator.py
index 24badcc..f6ee874 100644
--- a/tests/unit_tests/test_extracted_data_validator.py
+++ b/tests/unit_tests/test_extracted_data_validator.py
@@ -99,6 +99,16 @@ def test_duplicate_required_files_detected(self):
             self.assertFalse(validator.is_valid())
             self.assertEqual(validator.error, 'Multiple .geojson files of the same type found: nodes.')
 
+    def test_unsupported_files_are_rejected(self):
+        validator = ExtractedDataValidator(self.test_dir)
+        self.create_files(['a.nodes.geojson', 'something_else.geojson'])
+        self.assertFalse(validator.is_valid())
+        self.assertEqual(
+            validator.error,
+            'Unsupported .geojson files present: something_else.geojson. '
+            'Allowed file types are {edges, nodes, points, lines, zones, polygons}'
+        )
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/unit_tests/test_osw_validation.py b/tests/unit_tests/test_osw_validation.py
index 13d3dde..167f92f 100644
--- a/tests/unit_tests/test_osw_validation.py
+++ b/tests/unit_tests/test_osw_validation.py
@@ -166,22 +166,22 @@ def test_points_invalid_zipfile_with_invalid_schema(self):
     def test_external_extension_file_inside_zipfile(self):
         validation = OSWValidation(zipfile_path=self.external_extension_file_zipfile)
         result = validation.validate()
-        self.assertTrue(result.is_valid)
-        self.assertIsNone(result.errors)
+        self.assertFalse(result.is_valid)
+        self.assertIsNotNone(result.errors)
 
     def test_external_extension_file_inside_zipfile_with_schema(self):
         validation = OSWValidation(zipfile_path=self.external_extension_file_zipfile,
                                    schema_paths=self.schema_paths)
         result = validation.validate()
-        self.assertTrue(result.is_valid)
-        self.assertIsNone(result.errors)
+        self.assertFalse(result.is_valid)
+        self.assertIsNotNone(result.errors)
 
     def test_external_extension_file_inside_zipfile_with_invalid_schema(self):
         validation = OSWValidation(zipfile_path=self.external_extension_file_zipfile,
                                    schema_file_path=self.invalid_schema_file_path)
         result = validation.validate()
-        self.assertTrue(result.is_valid)
-        self.assertIsNone(result.errors)
+        self.assertFalse(result.is_valid)
+        self.assertIsNotNone(result.errors)
 
     def test_id_missing_zipfile(self):
         validation = OSWValidation(zipfile_path=self.id_missing_zipfile)
diff --git a/tests/unit_tests/test_osw_validation_extras.py b/tests/unit_tests/test_osw_validation_extras.py
index 0f4818a..3f7ef52 100644
--- a/tests/unit_tests/test_osw_validation_extras.py
+++ b/tests/unit_tests/test_osw_validation_extras.py
@@ -73,8 +73,8 @@ def _fake_validator(self, files, external_exts=None, valid=True, error="folder i
 
     # ---------------- tests ----------------
 
-    def test_missing_u_id_logged_and_no_keyerror(self):
-        """Edges missing `_u_id` should log a friendly error instead of raising KeyError."""
+    def test_missing_u_id_reports_error_without_keyerror(self):
+        """Edges missing `_u_id` should report a friendly error instead of raising KeyError."""
         fake_files = ["/tmp/nodes.geojson", "/tmp/edges.geojson"]
         nodes = self._gdf_nodes([1, 2])
         # edges WITHOUT _u_id; include _id to bypass duplicated('_id') KeyError
@@ -175,7 +175,7 @@ def _rf(path):
                 shown_ids = [x.strip() for x in displayed.split(",")]
                 self.assertLessEqual(len(shown_ids), 20)
 
-    def test_load_osw_file_logs_json_decode_error(self):
+    def test_load_osw_file_reports_json_decode_error(self):
         """Invalid JSON should surface a detailed message with location context."""
         validator = OSWValidation(zipfile_path="dummy.zip")
         with tempfile.NamedTemporaryFile("w", suffix=".geojson", delete=False) as tmp:
@@ -200,7 +200,7 @@ def test_load_osw_file_logs_json_decode_error(self):
         self.assertIsNone(issue["feature_index"])
         self.assertEqual(issue["error_message"], message)
 
-    def test_load_osw_file_logs_os_error(self):
+    def test_load_osw_file_reports_os_error(self):
         """Missing files should log a readable OS error message."""
         validator = OSWValidation(zipfile_path="dummy.zip")
         missing_path = os.path.join(tempfile.gettempdir(), "nonexistent_osw_file.geojson")
@@ -252,7 +252,7 @@ def test_validate_reports_json_decode_error(self):
         finally:
             os.unlink(bad_path)
 
-    def test_validate_logs_read_file_exception(self):
+    def test_validate_reports_read_file_exception(self):
         """GeoDataFrame read failures are logged and do not crash."""
         fake_files = ["/tmp/edges.geojson"]
 
@@ -274,8 +274,8 @@ def test_validate_logs_read_file_exception(self):
         self.assertTrue(any("Failed to read 'edges.geojson' as GeoJSON: boom" in e for e in (res.errors or [])),
                         f"Errors were: {res.errors}")
 
-    def test_missing_w_id_logs_error(self):
-        """Zones missing _w_id should log a clear message."""
+    def test_missing_w_id_reports_error(self):
+        """Zones missing _w_id should report a clear message."""
         fake_files = ["/tmp/nodes.geojson", "/tmp/zones.geojson"]
         nodes = self._gdf_nodes([1, 2])
         # zones without _w_id column
@@ -310,8 +310,8 @@ def _rf(path):
         self.assertTrue(any("Missing required column '_w_id' in zones." in e for e in (res.errors or [])),
                         f"Errors were: {res.errors}")
 
-    def test_extension_read_failure_is_logged(self):
-        """Failure reading an extension file should be logged and skipped."""
+    def test_extension_read_failure_reports_error(self):
+        """Failure reading an extension file should be reported and skipped."""
         fake_files = ["/tmp/nodes.geojson"]
         nodes = self._gdf_nodes([1])
         ext_path = "/tmp/custom.geojson"
@@ -346,8 +346,8 @@ def _rf(path):
         self.assertTrue(any("Failed to read extension 'custom.geojson' as GeoJSON: boom" in e for e in (res.errors or [])),
                         f"Errors were: {res.errors}")
 
-    def test_extension_invalid_ids_logging_failure(self):
-        """If invalid extension features exist but id extraction fails, we log gracefully."""
+    def test_extension_invalid_ids_reports_extraction_failure(self):
+        """If invalid extension features exist but id extraction fails, we report gracefully."""
         ext_path = "/tmp/custom.geojson"
         fake_files = ["/tmp/nodes.geojson"]
         nodes = self._gdf_nodes([1])
@@ -392,6 +392,94 @@ def _rf(path):
                             for e in (res.errors or [])),
                         f"Errors were: {res.errors}")
 
+    def test_extension_invalid_geometries_reported(self):
+        """Invalid geometries inside an extension file should surface a clear error."""
+        ext_path = "/tmp/custom.geojson"
+        fake_files = ["/tmp/nodes.geojson"]
+        nodes = self._gdf_nodes([1])
+
+        invalid_geojson = MagicMock()
+        invalid_geojson.__len__.return_value = 2
+        invalid_geojson.get.return_value = ["a", "b"]
+
+        extension_file = MagicMock()
+        extension_file.is_valid = [False, False]
+        extension_file.__getitem__.return_value = invalid_geojson
+        extension_file.drop.return_value = pd.DataFrame()
+
+        with patch(_PATCH_ZIP) as PZip, \
+             patch(_PATCH_EV) as PVal, \
+             patch(_PATCH_VALIDATE, return_value=True), \
+             patch(_PATCH_READ_FILE) as PRead, \
+             patch(_PATCH_DATASET_FILES, _CANON_DATASET_FILES):
+
+            z = MagicMock()
+            z.extract_zip.return_value = "/tmp/extracted"
+            z.remove_extracted_files.return_value = None
+            PZip.return_value = z
+
+            val = self._fake_validator(fake_files, external_exts=[ext_path])
+            PVal.return_value = val
+
+            def _rf(path):
+                b = os.path.basename(path)
+                if "nodes" in b:
+                    return nodes
+                if os.path.basename(path) == os.path.basename(ext_path):
+                    return extension_file
+                return gpd.GeoDataFrame()
+
+            PRead.side_effect = _rf
+
+            res = OSWValidation(zipfile_path="dummy.zip").validate()
+
+        self.assertFalse(res.is_valid)
+        self.assertTrue(any("Invalid geometries found in extension file `custom.geojson`" in e
+                            for e in (res.errors or [])),
+                        f"Errors were: {res.errors}")
+
+    def test_extension_serialization_failure_reported(self):
+        """Non-serializable extension properties should be reported."""
+        ext_path = "/tmp/custom.geojson"
+        fake_files = ["/tmp/nodes.geojson"]
+        nodes = self._gdf_nodes([1])
+
+        extension_file = MagicMock()
+        extension_file.is_valid = [True]
+        extension_file.__getitem__.return_value = gpd.GeoDataFrame()  # no invalid geometries
+        extension_file.drop.side_effect = Exception("serialize boom")
+
+        with patch(_PATCH_ZIP) as PZip, \
+             patch(_PATCH_EV) as PVal, \
+             patch(_PATCH_VALIDATE, return_value=True), \
+             patch(_PATCH_READ_FILE) as PRead, \
+             patch(_PATCH_DATASET_FILES, _CANON_DATASET_FILES):
+
+            z = MagicMock()
+            z.extract_zip.return_value = "/tmp/extracted"
+            z.remove_extracted_files.return_value = None
+            PZip.return_value = z
+
+            val = self._fake_validator(fake_files, external_exts=[ext_path])
+            PVal.return_value = val
+
+            def _rf(path):
+                b = os.path.basename(path)
+                if "nodes" in b:
+                    return nodes
+                if os.path.basename(path) == os.path.basename(ext_path):
+                    return extension_file
+                return gpd.GeoDataFrame()
+
+            PRead.side_effect = _rf
+
+            res = OSWValidation(zipfile_path="dummy.zip").validate()
+
+        self.assertFalse(res.is_valid)
+        self.assertTrue(any("Extension file `custom.geojson` has non-serializable properties: serialize boom" in e
+                            for e in (res.errors or [])),
+                        f"Errors were: {res.errors}")
+
     def test_duplicate_ids_detection(self):
         """Duplicates inside a single file are reported."""
         fake_files = ["/tmp/nodes.geojson"]
@@ -554,7 +642,7 @@ def test_get_colset_returns_set_when_column_present(self):
         s = v._get_colset(gdf, "_id", "nodes")
         self.assertEqual(s, {1, 2, 3})
 
-    def test_get_colset_logs_and_returns_empty_when_missing(self):
+    def test_get_colset_reports_missing_column(self):
         v = OSWValidation(zipfile_path="dummy.zip")
         gdf = self._gdf({"foo": [1, 2]}, geom="Point")
         s = v._get_colset(gdf, "_id", "nodes")
@@ -575,7 +663,7 @@ def test_get_colset_with_none_gdf(self):
         s = v._get_colset(None, "_id", "nodes")
         self.assertEqual(s, set())
 
-    def test_get_colset_logs_when_stringify_fails(self):
+    def test_get_colset_reports_stringify_failure(self):
         class BadObj:
             def __hash__(self):
                 raise TypeError("no hash")
@@ -590,6 +678,16 @@ def __str__(self):
         self.assertTrue(any("Could not create set for column 'meta' in nodes." in e for e in (v.errors or [])),
                         f"Errors were: {v.errors}")
 
+    def test_load_osw_schema_reports_missing_file(self):
+        v = OSWValidation(zipfile_path="dummy.zip")
+        missing_schema = os.path.join(tempfile.gettempdir(), "missing_schema.json")
+        if os.path.exists(missing_schema):
+            os.unlink(missing_schema)
+        with self.assertRaises(Exception):
+            v.load_osw_schema(missing_schema)
+        self.assertTrue(any("Invalid or missing schema file" in e for e in (v.errors or [])),
+                        f"Errors were: {v.errors}")
+
     def test_schema_02_rejects_tree_and_custom(self):
         base = {
             "$schema": "https://sidewalks.washington.edu/opensidewalks/0.2/schema.json",
@@ -758,6 +856,32 @@ def test_pick_schema_force_single_schema_override(self):
         self.assertEqual(v.pick_schema_for_file("/tmp/my.edges.geojson", {"features": []}), force)
         self.assertEqual(v.pick_schema_for_file("/any/path.json", {"features": [{"geometry": {"type": "Point"}}]}), force)
 
+    def test_unexpected_exception_surfaces_unable_to_validate(self):
+        """Any unexpected exception should be surfaced via 'Unable to validate'."""
+        fake_files = ["/tmp/nodes.geojson"]
+        with patch(_PATCH_ZIP) as PZip, \
+             patch(_PATCH_EV) as PVal, \
+             patch(_PATCH_VALIDATE, side_effect=RuntimeError("boom")), \
+             patch(_PATCH_DATASET_FILES, _CANON_DATASET_FILES):
+
+            z = MagicMock()
+            z.extract_zip.return_value = "/tmp/extracted"
+            z.remove_extracted_files.return_value = None
+            PZip.return_value = z
+
+            val = MagicMock()
+            val.files = fake_files
+            val.externalExtensions = []
+            val.is_valid.return_value = True
+            val.error = None
+            PVal.return_value = val
+
+            res = OSWValidation(zipfile_path="dummy.zip").validate()
+
+        self.assertFalse(res.is_valid)
+        self.assertTrue(any("Unable to validate: boom" in e for e in (res.errors or [])),
+                        f"Errors were: {res.errors}")
+
 
 class TestOSWValidationInvalidGeometryLogging(unittest.TestCase):
     """Covers the 'invalid geometries' logging branch, including _id-present and _id-missing fallback."""
diff --git a/unit_test_overview.md b/unit_test_overview.md
new file mode 100644
index 0000000..36db5bb
--- /dev/null
+++ b/unit_test_overview.md
@@ -0,0 +1,78 @@
+# Unit Test Scenario Counts
+
+Below is a breakdown of all test scenarios (functions named `test_`) across the suite, plus a short description of the main areas they cover.
+
+- **Total scenarios:** 116
+
+| Test module | Scenarios | Focus |
+| --- | ---: | --- |
+| tests/test_schema_parity.py | 14 | Schema parity and customized-field handling |
+| tests/unit_tests/test_extracted_data_validator.py | 11 | Folder/filename validation, duplicates/missing files, unsupported files |
+| tests/unit_tests/test_helpers.py | 14 | Helper utilities (pretty messages, rankings, feature index extraction) |
+| tests/unit_tests/test_osw_validation.py | 34 | End-to-end ZIP validation across assets (valid/invalid, schemas, serialization, unmatched IDs) |
+| tests/unit_tests/test_osw_validation_extras.py | 34 | Edge cases: foreign-key checks, duplicate IDs, invalid geometries, extension file failures, JSON/OS errors, 0.2 disallowed content, unexpected exceptions |
+| tests/unit_tests/test_schema_definitions.py | 1 | Schema definition smoke check |
+| tests/unit_tests/test_schema_metadata.py | 3 | Metadata/heuristic schema selection |
+| tests/unit_tests/test_zipfile_handler.py | 5 | Zip extraction/creation lifecycle |
+
+Method used: counted functions matching `def test_` within `tests/`.
+
+## Detail for heavy-hitter suites
+
+### tests/unit_tests/test_osw_validation.py (34 scenarios)
+- Zip validation success/failure paths: `test_valid_zipfile`, `test_valid_zipfile_with_schema`, `test_valid_zipfile_with_invalid_schema`, `test_minimal_zipfile*`, `test_invalid_zipfile*` (with/without schemas, error cap checks).
+- Dataset-specific invalid archives: `test_nodes_invalid_zipfile*`, `test_edges_invalid_zipfile*`, `test_points_invalid_zipfile*`.
+- External extensions: `test_external_extension_file_inside_zipfile*` (default/override/invalid schema).
+- Schema-level failures: missing `_id` (`test_id_missing_zipfile`), extra fields, invalid geometry, missing identifiers, no entities, wrong datatypes.
+- Zones coverage: `test_valid_zones_file`, `test_invalid_zones_file`.
+- Serialization error surfaced: `test_invalid_serialization_file`.
+- Foreign-key cap: `test_unmatched_ids_limited_to_20`.
+
+### tests/unit_tests/test_osw_validation_extras.py (34 scenarios)
+- Foreign-key checks: `test_missing_u_id_reports_error_without_keyerror`, `test_unmatched_u_id_is_limited_to_20`, `test_unmatched_w_id_is_limited_to_20`.
+- JSON/IO parsing: `test_load_osw_file_reports_json_decode_error`, `test_load_osw_file_reports_os_error`, `test_validate_reports_json_decode_error`.
+- GeoDataFrame read failures: `test_validate_reports_read_file_exception`.
+- Zones column presence: `test_missing_w_id_reports_error`.
+- Extension handling: read failure, invalid IDs, invalid geometries, serialization failure (`test_extension_*` group).
+- Duplicate IDs: `test_duplicate_ids_detection`, `test_duplicate_ids_detection_is_limited_to_20`.
+- Invalid geometries: `_id` present vs missing with cap (`test_invalid_geometry_logs_*`).
+- Helpers and selection: `test_pick_schema_*`, `_get_colset*`, `test_load_osw_schema_reports_missing_file`.
+- 0.2 disallowed content vs 0.3 allowed: `test_schema_02_rejects_tree_and_custom`, `test_schema_03_with_tree_tags_is_allowed`.
+- Robustness: cleanup handling, zip extract failure, invalid folder structure, unexpected exception path (`test_cleanup_handles_locals_membership_error`, `test_zip_extract_failure_bubbles_as_error`, `test_extracted_data_validator_invalid`, `test_unexpected_exception_surfaces_unable_to_validate`, `test_issues_populated_for_invalid_zip`).
+
+### tests/unit_tests/test_extracted_data_validator.py (11 scenarios)
+- Valid layouts at root or nested, empty/invalid directories, and no-geojson detection.
+- Duplicate detection and missing required files (patched OSW_DATASET_FILES).
+- Non-standard filenames rejected; valid subsets of canonical OSW files accepted.
+- Unsupported GeoJSON types (non OSW keys) are rejected.
+
+### tests/test_schema_parity.py (14 scenarios)
+- Custom feature acceptance/rejection across nodes/edges/points/lines/polygons/zones.
+- 0.2 vs 0.3 parity: custom tags rejected on 0.2 where appropriate; accepted on 0.3.
+- Extension fields allowed/blocked per schema version.
+
+### tests/unit_tests/test_helpers.py (14 scenarios)
+- Feature index extraction robustness across error shapes.
+- Pretty-message compaction (enum, anyOf, ordering, trimming noise).
+- Error ranking/tie-breaking and message selection helpers.
+
+### tests/unit_tests/test_schema_definitions.py (1 scenario)
+- Ensures schema definitions include required fields (smoke check).
+
+### tests/unit_tests/test_schema_metadata.py (3 scenarios)
+- Enforces schema requirement by geometry type (Point/LineString/Polygon) when schema URL missing.
+
+### tests/unit_tests/test_zipfile_handler.py (5 scenarios)
+- Zip extraction success/failure, cleanup of extracted files.
+- Zip creation happy path and failure path.
+
+## Coverage snapshot
+
+Name | Stmts | Miss | Cover
+--- | ---: | ---: | ---:
+src/python_osw_validation/__init__.py | 300 | 8 | 97%
+src/python_osw_validation/extracted_data_validator.py | 90 | 1 | 99%
+src/python_osw_validation/helpers.py | 54 | 2 | 96%
+src/python_osw_validation/version.py | 1 | 0 | 100%
+src/python_osw_validation/zipfile_handler.py | 48 | 1 | 98%
+**TOTAL** | 493 | 12 | 98%