
Commit 48b8abe

Merge pull request #43 from TaskarCenterAtUW/develop
[0.2.14] Develop to Main
2 parents a711551 + 7cd2d94

File tree: 4 files changed, +161 -16 lines changed


CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -1,5 +1,11 @@
 # Change log
 
+### 0.2.14
+- Improved GeoJSON parse error reporting with detailed file, line, and column context.
+- Added unit tests covering JSON parsing and file read failure scenarios.
+- Capped duplicate `_id` validation messages to the first 20 values by default, while still reporting the total duplicate count, to avoid excessively large issue payloads.
+- Added a unit test verifying that duplicate-ID logging trims the displayed IDs to 20 while reporting the total number of duplicates.
+
 ### 0.2.13
 - Updated Schema

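From the caller's side, the capped messages and parse errors surface through the normal validation flow. A minimal usage sketch, assuming the top-level OSWValidation entry point exercised by the unit tests below; the archive name is illustrative:

    from python_osw_validation import OSWValidation

    # max_errors caps how many offending IDs each message lists (default 20).
    result = OSWValidation(zipfile_path='dataset.zip').validate(max_errors=20)
    if not result.is_valid:
        for error in result.errors:
            # e.g. "Duplicate _id's found in nodes: showing first 20 of 25 duplicates: id0, ..."
            print(error)
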
src/python_osw_validation/__init__.py

Lines changed: 46 additions & 14 deletions
@@ -202,8 +202,15 @@ def validate(self, max_errors=20) -> ValidationResult:
                 continue
             is_valid, duplicates = self.are_ids_unique(gdf)
             if not is_valid:
+                total_duplicates = len(duplicates)
+                displayed = ', '.join(map(str, duplicates[:max_errors]))
+                if total_duplicates > max_errors:
+                    message = (f"Duplicate _id's found in {osw_file}: showing first {max_errors} "
+                               f"of {total_duplicates} duplicates: {displayed}")
+                else:
+                    message = f"Duplicate _id's found in {osw_file}: {displayed}"
                 self.log_errors(
-                    message=f"Duplicate _id's found in {osw_file} : {duplicates}",
+                    message=message,
                     filename=osw_file,
                     feature_index=None
                 )
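
To see the two message shapes this branch produces, the capping logic can be exercised in isolation. A minimal sketch; format_duplicate_message is a hypothetical standalone helper mirroring the hunk above, not part of the module:

    def format_duplicate_message(osw_file, duplicates, max_errors=20):
        # Hypothetical helper: cap the displayed IDs, always report the total.
        total = len(duplicates)
        displayed = ', '.join(map(str, duplicates[:max_errors]))
        if total > max_errors:
            return (f"Duplicate _id's found in {osw_file}: showing first {max_errors} "
                    f"of {total} duplicates: {displayed}")
        return f"Duplicate _id's found in {osw_file}: {displayed}"

    print(format_duplicate_message('nodes', ['n1', 'n2']))
    # -> Duplicate _id's found in nodes: n1, n2
    print(format_duplicate_message('nodes', [f'id{i}' for i in range(25)]))
    # -> Duplicate _id's found in nodes: showing first 20 of 25 duplicates: id0, id1, ..., id19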
@@ -238,11 +245,11 @@ def validate(self, max_errors=20) -> ValidationResult:
         if unmatched:
             unmatched_list = list(unmatched)
             num_unmatched = len(unmatched_list)
-            limit = min(num_unmatched, 20)
+            limit = min(num_unmatched, max_errors)
             displayed_unmatched = ', '.join(map(str, unmatched_list[:limit]))
             self.log_errors(
                 message=(f"All _u_id's in edges should be part of _id's mentioned in nodes. "
-                         f"Showing {'20' if num_unmatched > 20 else 'all'} out of {num_unmatched} "
+                         f"Showing {max_errors if num_unmatched > max_errors else 'all'} out of {num_unmatched} "
                          f"unmatched _u_id's: {displayed_unmatched}"),
                 filename='All',
                 feature_index=None
@@ -253,11 +260,11 @@ def validate(self, max_errors=20) -> ValidationResult:
         if unmatched:
             unmatched_list = list(unmatched)
             num_unmatched = len(unmatched_list)
-            limit = min(num_unmatched, 20)
+            limit = min(num_unmatched, max_errors)
             displayed_unmatched = ', '.join(map(str, unmatched_list[:limit]))
             self.log_errors(
                 message=(f"All _v_id's in edges should be part of _id's mentioned in nodes. "
-                         f"Showing {'20' if num_unmatched > 20 else 'all'} out of {num_unmatched} "
+                         f"Showing {max_errors if num_unmatched > max_errors else 'all'} out of {num_unmatched} "
                          f"unmatched _v_id's: {displayed_unmatched}"),
                 filename='All',
                 feature_index=None
@@ -268,11 +275,11 @@ def validate(self, max_errors=20) -> ValidationResult:
         if unmatched:
             unmatched_list = list(unmatched)
             num_unmatched = len(unmatched_list)
-            limit = min(num_unmatched, 20)
+            limit = min(num_unmatched, max_errors)
             displayed_unmatched = ', '.join(map(str, unmatched_list[:limit]))
             self.log_errors(
                 message=(f"All _w_id's in zones should be part of _id's mentioned in nodes. "
-                         f"Showing {'20' if num_unmatched > 20 else 'all'} out of {num_unmatched} "
+                         f"Showing {max_errors if num_unmatched > max_errors else 'all'} out of {num_unmatched} "
                          f"unmatched _w_id's: {displayed_unmatched}"),
                 filename='All',
                 feature_index=None
@@ -295,10 +302,10 @@ def validate(self, max_errors=20) -> ValidationResult:
             ids_series = invalid_geojson['_id'] if '_id' in invalid_geojson.columns else invalid_geojson.index
             invalid_ids = list(set(ids_series))
             num_invalid = len(invalid_ids)
-            limit = min(num_invalid, 20)
+            limit = min(num_invalid, max_errors)
             displayed_invalid = ', '.join(map(str, invalid_ids[:limit]))
             self.log_errors(
-                message=(f"Showing {'20' if num_invalid > 20 else 'all'} out of {num_invalid} "
+                message=(f"Showing {max_errors if num_invalid > max_errors else 'all'} out of {num_invalid} "
                          f"invalid {osw_file} geometries, id's of invalid geometries: {displayed_invalid}"),
                 filename='All',
                 feature_index=None
@@ -323,11 +330,11 @@ def validate(self, max_errors=20) -> ValidationResult:
                 try:
                     invalid_ids = list(set(invalid_geojson.get('_id', invalid_geojson.index)))
                     num_invalid = len(invalid_ids)
-                    limit = min(num_invalid, 20)
+                    limit = min(num_invalid, max_errors)
                     displayed_invalid = ', '.join(map(str, invalid_ids[:limit]))
                     self.log_errors(
                         message=(f"Invalid geometries found in extension file `{file_name}`. "
-                                 f"Showing {limit if num_invalid > 20 else 'all'} of {num_invalid} "
+                                 f"Showing {max_errors if num_invalid > max_errors else 'all'} of {num_invalid} "
                                  f"invalid geometry IDs: {displayed_invalid}"),
                         filename=file_name,
                         feature_index=None
@@ -388,8 +395,28 @@ def validate(self, max_errors=20) -> ValidationResult:
             gc.collect()
 
     def load_osw_file(self, graph_geojson_path: str) -> Dict[str, Any]:
-        with open(graph_geojson_path, 'r') as file:
-            return json.load(file)
+        try:
+            with open(graph_geojson_path, 'r') as file:
+                return json.load(file)
+        except json.JSONDecodeError as e:
+            filename = os.path.basename(graph_geojson_path)
+            self.log_errors(
+                message=(
+                    f"Failed to parse '{filename}' as valid JSON. "
+                    f"{e.msg} (line {e.lineno}, column {e.colno}, char {e.pos})."
+                ),
+                filename=filename,
+                feature_index=None,
+            )
+            raise
+        except OSError as e:
+            filename = os.path.basename(graph_geojson_path)
+            self.log_errors(
+                message=f"Unable to read file '{filename}': {e.strerror or e}",
+                filename=filename,
+                feature_index=None,
+            )
+            raise
 
     def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
         """Validate one OSW GeoJSON against the appropriate schema (streaming.
@@ -399,7 +426,12 @@ def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
         before returning, pushes a single human-friendly message per feature
         into `self.issues` (like your sample: "must include one of: ...").
         """
-        geojson_data = self.load_osw_file(file_path)
+        try:
+            geojson_data = self.load_osw_file(file_path)
+        except json.JSONDecodeError:
+            return False
+        except OSError:
+            return False
         schema_path = self.pick_schema_for_file(file_path, geojson_data)
         schema = self.load_osw_schema(schema_path)
         validator = jsonschema_rs.Draft7Validator(schema)
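
The parse-error message built in load_osw_file uses the standard attributes of json.JSONDecodeError (msg, lineno, colno, pos). A minimal sketch of what those fields carry for a malformed document; the sample string is illustrative:

    import json

    try:
        json.loads('{"features": [1, }')
    except json.JSONDecodeError as e:
        # Standard-library attributes; no extra dependencies needed.
        print(f"{e.msg} (line {e.lineno}, column {e.colno}, char {e.pos})")
        # -> Expecting value (line 1, column 18, char 17)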
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = '0.2.13'
+__version__ = '0.2.14'

tests/unit_tests/test_osw_validation_extras.py

Lines changed: 108 additions & 1 deletion
@@ -1,4 +1,6 @@
+import json
 import os
+import tempfile
 import unittest
 from unittest.mock import patch, MagicMock
 import geopandas as gpd
@@ -172,6 +174,83 @@ def _rf(path):
         shown_ids = [x.strip() for x in displayed.split(",")]
         self.assertLessEqual(len(shown_ids), 20)
 
+    def test_load_osw_file_logs_json_decode_error(self):
+        """Invalid JSON should surface a detailed message with location context."""
+        validator = OSWValidation(zipfile_path="dummy.zip")
+        with tempfile.NamedTemporaryFile("w", suffix=".geojson", delete=False) as tmp:
+            tmp.write('{"features": [1, }')
+            bad_path = tmp.name
+        try:
+            with self.assertRaises(json.JSONDecodeError):
+                validator.load_osw_file(bad_path)
+        finally:
+            os.unlink(bad_path)
+
+        self.assertTrue(any("Failed to parse" in e for e in (validator.errors or [])),
+                        f"Errors were: {validator.errors}")
+        message = validator.errors[-1]
+        basename = os.path.basename(bad_path)
+        self.assertIn(basename, message)
+        self.assertIn("line", message)
+        self.assertIn("column", message)
+
+        issue = validator.issues[-1]
+        self.assertEqual(issue["filename"], basename)
+        self.assertIsNone(issue["feature_index"])
+        self.assertEqual(issue["error_message"], message)
+
+    def test_load_osw_file_logs_os_error(self):
+        """Missing files should log a readable OS error message."""
+        validator = OSWValidation(zipfile_path="dummy.zip")
+        missing_path = os.path.join(tempfile.gettempdir(), "nonexistent_osw_file.geojson")
+        if os.path.exists(missing_path):
+            os.unlink(missing_path)
+
+        with self.assertRaises(OSError):
+            validator.load_osw_file(missing_path)
+
+        self.assertTrue(any("Unable to read file" in e for e in (validator.errors or [])),
+                        f"Errors were: {validator.errors}")
+        message = validator.errors[-1]
+        basename = os.path.basename(missing_path)
+        self.assertIn(basename, message)
+        self.assertIn("Unable to read file", message)
+
+        issue = validator.issues[-1]
+        self.assertEqual(issue["filename"], basename)
+        self.assertIsNone(issue["feature_index"])
+        self.assertEqual(issue["error_message"], message)
+
+    def test_validate_reports_json_decode_error(self):
+        """Full validation flow should surface parse errors before schema checks."""
+        with tempfile.NamedTemporaryFile("w", suffix=".geojson", delete=False) as tmp:
+            tmp.write('{"type": "FeatureCollection", "features": [1, }')
+            bad_path = tmp.name
+
+        try:
+            with patch(_PATCH_ZIP) as PZip, patch(_PATCH_EV) as PVal:
+                z = MagicMock()
+                z.extract_zip.return_value = "/tmp/extracted"
+                z.remove_extracted_files.return_value = None
+                PZip.return_value = z
+
+                PVal.return_value = self._fake_validator([bad_path])
+
+                result = OSWValidation(zipfile_path="dummy.zip").validate()
+
+            self.assertFalse(result.is_valid)
+            message = next((e for e in (result.errors or []) if "Failed to parse" in e), None)
+            self.assertIsNotNone(message, f"Expected parse error message. Errors: {result.errors}")
+            basename = os.path.basename(bad_path)
+            self.assertIn(basename, message)
+            self.assertIn("line", message)
+
+            issue = next((i for i in (result.issues or []) if i["filename"] == basename), None)
+            self.assertIsNotNone(issue, f"Issues were: {result.issues}")
+            self.assertEqual(issue["error_message"], message)
+        finally:
+            os.unlink(bad_path)
+
     def test_duplicate_ids_detection(self):
         """Duplicates inside a single file are reported."""
         fake_files = ["/tmp/nodes.geojson"]
@@ -190,7 +269,35 @@ def test_duplicate_ids_detection(self):
 
             res = OSWValidation(zipfile_path="dummy.zip").validate()
             self.assertFalse(res.is_valid)
-            self.assertTrue(any("Duplicate _id's found in nodes" in e for e in (res.errors or [])))
+            msg = next((e for e in (res.errors or []) if "Duplicate _id's found in nodes" in e), None)
+            self.assertEqual(msg, "Duplicate _id's found in nodes: 2")
+
+    def test_duplicate_ids_detection_is_limited_to_20(self):
+        """Duplicate messages cap the number of displayed IDs."""
+        fake_files = ["/tmp/nodes.geojson"]
+        duplicate_ids = [f"id{i}" for i in range(25) for _ in (0, 1)]  # 25 unique IDs, each duplicated once
+        nodes = self._gdf_nodes(duplicate_ids)
+
+        with patch(_PATCH_ZIP) as PZip, \
+             patch(_PATCH_EV) as PVal, \
+             patch(_PATCH_VALIDATE, return_value=True), \
+             patch(_PATCH_READ_FILE, return_value=nodes), \
+             patch(_PATCH_DATASET_FILES, _CANON_DATASET_FILES):
+            z = MagicMock()
+            z.extract_zip.return_value = "/tmp/extracted"
+            PZip.return_value = z
+            PVal.return_value = self._fake_validator(fake_files)
+
+            res = OSWValidation(zipfile_path="dummy.zip").validate()
+            self.assertFalse(res.is_valid)
+            msg = next((e for e in (res.errors or []) if "Duplicate _id's found in nodes" in e), None)
+            self.assertIsNotNone(msg, "Expected duplicate-id error not found")
+            self.assertIn("showing first 20 of 25 duplicates", msg)
+            displayed = msg.split("duplicates:")[-1].strip()
+            shown_ids = [part.strip() for part in displayed.split(",") if part.strip()]
+            self.assertLessEqual(len(shown_ids), 20)
+            expected_ids = [f"id{i}" for i in range(20)]
+            self.assertEqual(shown_ids, expected_ids)
 
     def test_pick_schema_by_geometry_and_by_filename(self):
         """Point/LineString/Polygon ⇒ proper schema; filename fallback when features empty."""

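Assuming the repository's standard unittest layout and execution from the project root, the new failure-path tests can be run on their own; a minimal sketch:

    import unittest

    # Discover and run only the extras module touched by this commit.
    suite = unittest.defaultTestLoader.discover(
        'tests/unit_tests', pattern='test_osw_validation_extras.py')
    unittest.TextTestRunner(verbosity=2).run(suite)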