Skip to content

Commit 2eeda7f

Browse files
committed
- Clear error messaging when a zip file is empty
- Consolidated multiple lists of OSW data files into one location - Prohibit duplicate OSW data files (i.e. one edges file allowed only) - Clearer error messaging - Aggregate schema errors and data integrity errors separately before returning errors to user - Added unit tests for external extensions - Updated unit tests and results - Upgraded all test assets to OSW 0.2 whenever possible
1 parent 8a5ac65 commit 2eeda7f

22 files changed

+118
-94
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,5 @@
2525
- Made all OSW files optional
2626
- Added additional validation steps based on the OSW network properties
2727
- Add external extensions to ExtractedDataValidator
28-
- Validate external extensions against basic Open Geospatial Consortium (OGC) standards
28+
- Validate external extensions against basic Open Geospatial Consortium (OGC) standards
29+
- Aggregate schema errors and data integrity errors separately before returning errors to user

README.md

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,19 @@ folder.
6060
```shell
6161

6262
> coverage run --source=src/python_osw_validation -m unittest discover -v tests/unit_tests
63-
test_invalid_empty_directory (test_extracted_data_validator.TestExtractedDataValidator) ... ok
64-
test_invalid_missing_files_directory (test_extracted_data_validator.TestExtractedDataValidator) ... ok
65-
test_invalid_missing_required_files_directory (test_extracted_data_validator.TestExtractedDataValidator) ... ok
66-
test_valid_directory_structure (test_extracted_data_validator.TestExtractedDataValidator) ... ok
63+
test_duplicate_files (test_extracted_data_validator.TestExtractedDataValidator) ... ok
64+
test_empty_directory (test_extracted_data_validator.TestExtractedDataValidator) ... ok
65+
test_invalid_directory (test_extracted_data_validator.TestExtractedDataValidator) ... ok
66+
test_missing_optional_file (test_extracted_data_validator.TestExtractedDataValidator) ... ok
67+
test_no_geojson_files (test_extracted_data_validator.TestExtractedDataValidator) ... ok
68+
test_valid_data_at_root (test_extracted_data_validator.TestExtractedDataValidator) ... ok
69+
test_valid_data_inside_folder (test_extracted_data_validator.TestExtractedDataValidator) ... ok
6770
test_edges_invalid_zipfile (test_osw_validation.TestOSWValidation) ... ok
6871
test_edges_invalid_zipfile_with_invalid_schema (test_osw_validation.TestOSWValidation) ... ok
6972
test_edges_invalid_zipfile_with_schema (test_osw_validation.TestOSWValidation) ... ok
73+
test_external_extension_file_inside_zipfile (test_osw_validation.TestOSWValidation) ... ok
74+
test_external_extension_file_inside_zipfile_with_invalid_schema (test_osw_validation.TestOSWValidation) ... ok
75+
test_external_extension_file_inside_zipfile_with_schema (test_osw_validation.TestOSWValidation) ... ok
7076
test_extra_field_zipfile (test_osw_validation.TestOSWValidation) ... ok
7177
test_id_missing_zipfile (test_osw_validation.TestOSWValidation) ... ok
7278
test_invalid_geometry_zipfile (test_osw_validation.TestOSWValidation) ... ok
@@ -76,9 +82,6 @@ test_invalid_zipfile_with_schema (test_osw_validation.TestOSWValidation) ... ok
7682
test_minimal_zipfile (test_osw_validation.TestOSWValidation) ... ok
7783
test_minimal_zipfile_with_invalid_schema (test_osw_validation.TestOSWValidation) ... ok
7884
test_minimal_zipfile_with_schema (test_osw_validation.TestOSWValidation) ... ok
79-
test_missing_files_inside_zipfile (test_osw_validation.TestOSWValidation) ... ok
80-
test_missing_files_inside_zipfile_with_invalid_schema (test_osw_validation.TestOSWValidation) ... ok
81-
test_missing_files_inside_zipfile_with_schema (test_osw_validation.TestOSWValidation) ... ok
8285
test_missing_identifier_zipfile (test_osw_validation.TestOSWValidation) ... ok
8386
test_no_entity_zipfile (test_osw_validation.TestOSWValidation) ... ok
8487
test_nodes_invalid_zipfile (test_osw_validation.TestOSWValidation) ... ok
@@ -96,7 +99,7 @@ test_extract_valid_zip (test_zipfile_handler.TestZipFileHandler) ... ok
9699
test_remove_extracted_files (test_zipfile_handler.TestZipFileHandler) ... ok
97100

98101
----------------------------------------------------------------------
99-
Ran 34 tests in 121.220s
102+
Ran 37 tests in 1284.068s
100103

101104
OK
102105
```

src/python_osw_validation/__init__.py

Lines changed: 22 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -4,36 +4,10 @@
44
from typing import Dict, Any, Optional, List
55
import geopandas as gpd
66
from .zipfile_handler import ZipFileHandler
7-
from .extracted_data_validator import ExtractedDataValidator
7+
from .extracted_data_validator import ExtractedDataValidator, OSW_dataset_files
88

99
SCHEMA_PATH = os.path.join(os.path.dirname(__file__), 'schema')
1010

11-
OSW_dataset_files = {"edges": {
12-
"required": False,
13-
"geometry": "LineString"
14-
},
15-
"nodes": {
16-
"required": False,
17-
"geometry": "Point"
18-
},
19-
"points": {
20-
"required": False,
21-
"geometry": "Point"
22-
},
23-
"lines": {
24-
"required": False,
25-
"geometry": "LineString"
26-
},
27-
"zones": {
28-
"required": False,
29-
"geometry": "Polygon"
30-
},
31-
"polygons": {
32-
"required": False,
33-
"geometry": "Polygon"
34-
}
35-
}
36-
3711

3812
class ValidationResult:
3913
def __init__(self, is_valid: bool, errors: Optional[List[str]] = None):
@@ -62,7 +36,7 @@ def load_osw_schema(self, schema_path: str) -> Dict[str, Any]:
6236
self.errors.append(f'Invalid or missing schema file: {e}')
6337
raise Exception(f'Invalid or missing schema file: {e}')
6438

65-
def unique_id(self, gdf):
39+
def are_ids_unique(self, gdf):
6640
"""Check for duplicate values in the _id field"""
6741
duplicates = gdf[gdf.duplicated('_id', keep=False)]['_id'].unique()
6842

@@ -85,28 +59,26 @@ def validate(self) -> ValidationResult:
8559
if not validator.is_valid():
8660
self.errors.append(validator.error)
8761
return ValidationResult(False, self.errors)
88-
8962
for file in validator.files:
9063
file_path = os.path.join(file)
91-
is_valid = self.validate_osw_errors(self.load_osw_file(file_path))
92-
if not is_valid:
93-
zip_handler.remove_extracted_files()
94-
return ValidationResult(False, self.errors)
64+
is_valid = self.validate_osw_errors(file_path)
65+
66+
if self.errors:
67+
zip_handler.remove_extracted_files()
68+
return ValidationResult(False, self.errors)
9569

9670
# Validate data integrity
9771
OSW_dataset = {}
9872
for file in validator.files:
9973
file_path = os.path.join(file)
100-
osw_file = file_path.split('.')[-2]
74+
osw_file = next((osw_file_any for osw_file_any in OSW_dataset_files.keys() if osw_file_any in file_path), '')
10175
OSW_dataset[osw_file] = gpd.read_file(file_path)
10276

10377
# Are all id's unique in each file? No need to check uniqueness across files yet since we do not have a global OSW ID format yet
10478
for osw_file in OSW_dataset:
105-
is_valid, duplicates = self.unique_id(OSW_dataset[osw_file])
79+
is_valid, duplicates = self.are_ids_unique(OSW_dataset[osw_file])
10680
if not is_valid:
107-
zip_handler.remove_extracted_files()
10881
self.errors.append(f"Duplicate _id's found in {osw_file} : {duplicates}")
109-
return ValidationResult(False, self.errors)
11082

11183
# Create sets of node id's and foreign keys to be used in validation
11284
if "nodes" in OSW_dataset:
@@ -130,47 +102,41 @@ def validate(self) -> ValidationResult:
130102
unmatched = node_ids_edges_u - node_ids
131103
is_valid = len(unmatched) == 0
132104
if not is_valid:
133-
zip_handler.remove_extracted_files()
134-
self.errors.append(f"Foreign key constraints for edge start nodes failed, _u_id's of unmatched nodes: {unmatched}")
135-
return ValidationResult(False, self.errors)
105+
self.errors.append(f"All _u_id's in edges should be part of _id's mentioned in nodes, _u_id's not in nodes are: {unmatched}")
136106

137107
# Do all node references in _v_id exist in nodes?
138108
unmatched = node_ids_edges_v - node_ids
139109
is_valid = len(unmatched) == 0
140110
if not is_valid:
141-
zip_handler.remove_extracted_files()
142-
self.errors.append(f"Foreign key constraints for edge end nodes failed, _v_id's of unmatched nodes: {unmatched}")
143-
return ValidationResult(False, self.errors)
111+
self.errors.append(f"All _v_id's in edges should be part of _id's mentioned in nodes, _v_id's not in nodes are: {unmatched}")
144112

145113
# Do all node references in _w_id exist in nodes?
146114
unmatched = node_ids_zones_w - node_ids
147115
is_valid = len(unmatched) == 0
148116
if not is_valid:
149-
zip_handler.remove_extracted_files()
150-
self.errors.append(f"Foreign key constraints for zone nodes failed, _w_id's of unmatched nodes: {unmatched}")
151-
return ValidationResult(False, self.errors)
152-
117+
self.errors.append(f"All _w_id's in zones should be part of _id's mentioned in nodes, _w_id's not in nodes are: {unmatched}")
118+
153119
# Geometry validation: check geometry type in each file and test if coordinates make a shape that is reasonable geometric shape according to the Simple Feature Access standard
154120
for osw_file in OSW_dataset:
155121
invalid_geojson = OSW_dataset[osw_file][(OSW_dataset[osw_file].geometry.type != OSW_dataset_files[osw_file]['geometry']) | (OSW_dataset[osw_file].is_valid==False)]
156122
is_valid = len(invalid_geojson) == 0
157123
if not is_valid:
158-
zip_handler.remove_extracted_files()
159124
self.errors.append(f"Invalid {osw_file} geometries found, id's of invalid geometries: {set(invalid_geojson['_id'])}")
160-
return ValidationResult(False, self.errors)
161-
125+
162126
# Validate OSW external extensions
163127
for file in validator.externalExtensions:
164128
file_path = os.path.join(file)
165129
extensionFile = gpd.read_file(file_path)
166130
invalid_geojson = extensionFile[extensionFile.is_valid==False]
167131
is_valid = len(invalid_geojson) == 0
168132
if not is_valid:
169-
zip_handler.remove_extracted_files()
170133
self.errors.append(f"Invalid geometries found in extension file {file}, list of invalid geometries: {invalid_geojson.to_json()}")
171-
return ValidationResult(False, self.errors)
172-
173-
return ValidationResult(True)
134+
135+
if self.errors:
136+
zip_handler.remove_extracted_files()
137+
return ValidationResult(False, self.errors)
138+
else:
139+
return ValidationResult(True)
174140
except Exception as e:
175141
self.errors.append(f'Unable to validate: {e}')
176142
return ValidationResult(False, self.errors)
@@ -180,8 +146,9 @@ def load_osw_file(self, graph_geojson_path: str) -> Dict[str, Any]:
180146
with open(graph_geojson_path, 'r') as file:
181147
return json.load(file)
182148

183-
def validate_osw_errors(self, geojson_data: Dict[str, Any]) -> bool:
149+
def validate_osw_errors(self, file_path: str) -> bool:
184150
'''Validate OSW Data against the schema and process all errors'''
151+
geojson_data = self.load_osw_file(file_path)
185152
validator = jsonschema.Draft7Validator(self.load_osw_schema(self.schema_file_path))
186153
errors = list(validator.iter_errors(geojson_data))
187154

src/python_osw_validation/extracted_data_validator.py

Lines changed: 63 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,33 @@
22
import glob
33

44

5+
OSW_dataset_files = {"edges": {
6+
"required": False,
7+
"geometry": "LineString"
8+
},
9+
"nodes": {
10+
"required": False,
11+
"geometry": "Point"
12+
},
13+
"points": {
14+
"required": False,
15+
"geometry": "Point"
16+
},
17+
"lines": {
18+
"required": False,
19+
"geometry": "LineString"
20+
},
21+
"zones": {
22+
"required": False,
23+
"geometry": "Polygon"
24+
},
25+
"polygons": {
26+
"required": False,
27+
"geometry": "Polygon"
28+
}
29+
}
30+
31+
532
class ExtractedDataValidator:
633
def __init__(self, extracted_dir: str):
734
self.extracted_dir = extracted_dir
@@ -26,23 +53,45 @@ def is_valid(self) -> bool:
2653
self.error = 'No .geojson files found in the specified directory or its subdirectories.'
2754
return False
2855

29-
required_files = {}
30-
optional_files = {'nodes', 'edges', 'points', 'lines', 'zones', 'polygons'}
31-
for filename in geojson_files:
32-
base_name = os.path.basename(filename)
33-
for required_file in required_files:
56+
required_files = [key for key, value in OSW_dataset_files.items() if value['required']]
57+
optional_files = [key for key, value in OSW_dataset_files.items() if not value['required']]
58+
missing_files = []
59+
duplicate_files = []
60+
for required_file in required_files:
61+
file_count = 0
62+
for filename in geojson_files:
63+
base_name = os.path.basename(filename)
3464
if required_file in base_name and base_name.endswith('.geojson'):
35-
self.files.append(filename)
36-
required_files.remove(required_file)
37-
break
38-
for optional_file in optional_files:
65+
file_count += 1
66+
save_filename = filename
67+
if file_count == 0:
68+
# Missing required file
69+
missing_files.append(required_file)
70+
elif file_count == 1:
71+
self.files.append(save_filename)
72+
else:
73+
# Duplicate file
74+
duplicate_files.append(required_file)
75+
76+
for optional_file in optional_files:
77+
file_count = 0
78+
for filename in geojson_files:
79+
base_name = os.path.basename(filename)
3980
if optional_file in base_name and base_name.endswith('.geojson'):
40-
self.files.append(filename)
41-
optional_files.remove(optional_file)
42-
break
81+
file_count += 1
82+
save_filename = filename
83+
if file_count == 1:
84+
self.files.append(save_filename)
85+
elif file_count > 1:
86+
# Duplicate file
87+
duplicate_files.append(optional_file)
4388

44-
if required_files:
45-
self.error = f'Missing required .geojson files: {", ".join(required_files)}.'
89+
if missing_files:
90+
self.error = f'Missing required .geojson files: {", ".join(missing_files)}.'
91+
return False
92+
93+
if duplicate_files:
94+
self.error = f'Multiple .geojson files of the same type found: {", ".join(duplicate_files)}.'
4695
return False
4796

4897
# Add OSW external extensions, GeoJSON files we know nothing about

src/python_osw_validation/zipfile_handler.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ def extract_zip(self) -> Optional[str]:
4747
with zipfile.ZipFile(self.zip_file_path, "r") as zip_ref:
4848
zip_ref.extractall(self.extracted_dir)
4949

50+
if len(zip_ref.namelist()) == 0:
51+
raise Exception('ZIP file is empty')
52+
5053
first_item = zip_ref.namelist()[0]
5154

5255
return f'{self.extracted_dir}/{first_item}'

tests/assets/_id_missing.zip

37.2 KB
Binary file not shown.

tests/assets/edges_invalid.zip

5.2 KB
Binary file not shown.
980 Bytes
Binary file not shown.

tests/assets/extra_field.zip

37.2 KB
Binary file not shown.

tests/assets/invalid.zip

5.2 KB
Binary file not shown.

0 commit comments

Comments
 (0)