Skip to content

Commit f5473df

Browse files
authored
Merge pull request #11 from TaskarCenterAtUW/develop
Develop to Main
2 parents d7af6b8 + beabf4b commit f5473df

28 files changed

+3619
-455
lines changed

.github/workflows/deploy_to_test.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ jobs:
2323
- name: Installing git
2424
run: pip install gitpython
2525

26+
- name: Installing Packages
27+
run: pip install -r requirements.txt
28+
2629
- name: Generating version file
2730
run: python freeze_version.py
2831

.github/workflows/publish_to_pypi.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ jobs:
2222
- name: Installing git
2323
run: pip install gitpython
2424

25+
- name: Installing Packages
26+
run: pip install -r requirements.txt
27+
2528
- name: Generate local version
2629
run: python freeze_version.py
2730

CHANGELOG.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,13 @@
1717
- Support for multi-level geojson file
1818
- Now handles the following two folder structures when abc.zip is unzipped
1919
1. abc\{nodes, edges, points}.geojson
20-
2. {nodes, edges, points}.geojson
20+
2. {nodes, edges, points}.geojson
21+
22+
### 0.2.0
23+
- Updated schema file to OSW 0.2
24+
- Added create_zip method to ZipFileHandler
25+
- Made all OSW files optional
26+
- Added additional validation steps based on the OSW network properties
27+
- Added external extensions to ExtractedDataValidator
28+
- Validate external extensions against basic Open Geospatial Consortium (OGC) standards
29+
- Aggregate schema errors and data integrity errors separately before returning errors to user

README.md

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -60,13 +60,19 @@ folder.
6060
```shell
6161

6262
> coverage run --source=src/python_osw_validation -m unittest discover -v tests/unit_tests
63-
test_invalid_empty_directory (test_extracted_data_validator.TestExtractedDataValidator) ... ok
64-
test_invalid_missing_files_directory (test_extracted_data_validator.TestExtractedDataValidator) ... ok
65-
test_invalid_missing_required_files_directory (test_extracted_data_validator.TestExtractedDataValidator) ... ok
66-
test_valid_directory_structure (test_extracted_data_validator.TestExtractedDataValidator) ... ok
63+
test_duplicate_files (test_extracted_data_validator.TestExtractedDataValidator) ... ok
64+
test_empty_directory (test_extracted_data_validator.TestExtractedDataValidator) ... ok
65+
test_invalid_directory (test_extracted_data_validator.TestExtractedDataValidator) ... ok
66+
test_missing_optional_file (test_extracted_data_validator.TestExtractedDataValidator) ... ok
67+
test_no_geojson_files (test_extracted_data_validator.TestExtractedDataValidator) ... ok
68+
test_valid_data_at_root (test_extracted_data_validator.TestExtractedDataValidator) ... ok
69+
test_valid_data_inside_folder (test_extracted_data_validator.TestExtractedDataValidator) ... ok
6770
test_edges_invalid_zipfile (test_osw_validation.TestOSWValidation) ... ok
6871
test_edges_invalid_zipfile_with_invalid_schema (test_osw_validation.TestOSWValidation) ... ok
6972
test_edges_invalid_zipfile_with_schema (test_osw_validation.TestOSWValidation) ... ok
73+
test_external_extension_file_inside_zipfile (test_osw_validation.TestOSWValidation) ... ok
74+
test_external_extension_file_inside_zipfile_with_invalid_schema (test_osw_validation.TestOSWValidation) ... ok
75+
test_external_extension_file_inside_zipfile_with_schema (test_osw_validation.TestOSWValidation) ... ok
7076
test_extra_field_zipfile (test_osw_validation.TestOSWValidation) ... ok
7177
test_id_missing_zipfile (test_osw_validation.TestOSWValidation) ... ok
7278
test_invalid_geometry_zipfile (test_osw_validation.TestOSWValidation) ... ok
@@ -76,9 +82,6 @@ test_invalid_zipfile_with_schema (test_osw_validation.TestOSWValidation) ... ok
7682
test_minimal_zipfile (test_osw_validation.TestOSWValidation) ... ok
7783
test_minimal_zipfile_with_invalid_schema (test_osw_validation.TestOSWValidation) ... ok
7884
test_minimal_zipfile_with_schema (test_osw_validation.TestOSWValidation) ... ok
79-
test_missing_files_inside_zipfile (test_osw_validation.TestOSWValidation) ... ok
80-
test_missing_files_inside_zipfile_with_invalid_schema (test_osw_validation.TestOSWValidation) ... ok
81-
test_missing_files_inside_zipfile_with_schema (test_osw_validation.TestOSWValidation) ... ok
8285
test_missing_identifier_zipfile (test_osw_validation.TestOSWValidation) ... ok
8386
test_no_entity_zipfile (test_osw_validation.TestOSWValidation) ... ok
8487
test_nodes_invalid_zipfile (test_osw_validation.TestOSWValidation) ... ok
@@ -96,7 +99,7 @@ test_extract_valid_zip (test_zipfile_handler.TestZipFileHandler) ... ok
9699
test_remove_extracted_files (test_zipfile_handler.TestZipFileHandler) ... ok
97100

98101
----------------------------------------------------------------------
99-
Ran 34 tests in 121.220s
102+
Ran 37 tests in 1284.068s
100103

101104
OK
102105
```

freeze_version.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import git
44
from datetime import date
5+
from src.python_osw_validation.version import __version__
56

67
project_path = os.path.dirname(os.path.abspath(__file__))
78
version_file_path = '{}/version.py'.format(project_path)
@@ -11,7 +12,7 @@
1112

1213
build_date = date.today().strftime('%Y-%m-%d')
1314

14-
version = '0.0.5'
15+
version = __version__
1516

1617
with open(version_file_path, 'w+') as version_file:
1718
version_file.write("version = '{}'\n".format(version))

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
jsonschema==4.19.1
22
zipfile36==0.1.3
3-
coverage==7.2.7
3+
coverage==7.2.7
4+
geopandas==0.13.2

src/python_osw_validation/__init__.py

Lines changed: 86 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
import json
33
import jsonschema
44
from typing import Dict, Any, Optional, List
5+
import geopandas as gpd
56
from .zipfile_handler import ZipFileHandler
6-
from .extracted_data_validator import ExtractedDataValidator
7+
from .extracted_data_validator import ExtractedDataValidator, OSW_dataset_files
8+
from .version import __version__
79

810
SCHEMA_PATH = os.path.join(os.path.dirname(__file__), 'schema')
911

@@ -34,6 +36,14 @@ def load_osw_schema(self, schema_path: str) -> Dict[str, Any]:
3436
except Exception as e:
3537
self.errors.append(f'Invalid or missing schema file: {e}')
3638
raise Exception(f'Invalid or missing schema file: {e}')
39+
40+
def are_ids_unique(self, gdf):
41+
"""Check for duplicate values in the _id field"""
42+
duplicates = gdf[gdf.duplicated('_id', keep=False)]['_id'].unique()
43+
44+
is_valid = len(duplicates) == 0
45+
46+
return is_valid, list(duplicates)
3747

3848
def validate(self) -> ValidationResult:
3949
try:
@@ -50,15 +60,84 @@ def validate(self) -> ValidationResult:
5060
if not validator.is_valid():
5161
self.errors.append(validator.error)
5262
return ValidationResult(False, self.errors)
63+
for file in validator.files:
64+
file_path = os.path.join(file)
65+
is_valid = self.validate_osw_errors(file_path)
5366

67+
if self.errors:
68+
zip_handler.remove_extracted_files()
69+
return ValidationResult(False, self.errors)
70+
71+
# Validate data integrity
72+
OSW_dataset = {}
5473
for file in validator.files:
5574
file_path = os.path.join(file)
56-
is_valid = self.validate_osw_errors(self.load_osw_file(file_path))
75+
osw_file = next((osw_file_any for osw_file_any in OSW_dataset_files.keys() if osw_file_any in file_path), '')
76+
OSW_dataset[osw_file] = gpd.read_file(file_path)
77+
78+
# Are all _id values unique within each file? Uniqueness across files is not checked because there is no global OSW ID format yet
79+
for osw_file in OSW_dataset:
80+
is_valid, duplicates = self.are_ids_unique(OSW_dataset[osw_file])
5781
if not is_valid:
58-
zip_handler.remove_extracted_files()
59-
return ValidationResult(False, self.errors)
82+
self.errors.append(f"Duplicate _id's found in {osw_file} : {duplicates}")
83+
84+
# Create sets of node id's and foreign keys to be used in validation
85+
if "nodes" in OSW_dataset:
86+
node_ids = set(OSW_dataset['nodes']['_id'])
87+
else:
88+
node_ids = set()
89+
90+
if "edges" in OSW_dataset:
91+
node_ids_edges_u = set(OSW_dataset['edges']['_u_id'])
92+
node_ids_edges_v = set(OSW_dataset['edges']['_v_id'])
93+
else:
94+
node_ids_edges_u = set()
95+
node_ids_edges_v = set()
96+
97+
if "zones" in OSW_dataset:
98+
node_ids_zones_w = set([item for sublist in OSW_dataset['zones']['_w_id'] for item in sublist])
99+
else:
100+
node_ids_zones_w = set()
101+
102+
# Do all node references in _u_id exist in nodes?
103+
unmatched = node_ids_edges_u - node_ids
104+
is_valid = len(unmatched) == 0
105+
if not is_valid:
106+
self.errors.append(f"All _u_id's in edges should be part of _id's mentioned in nodes, _u_id's not in nodes are: {unmatched}")
60107

61-
return ValidationResult(True)
108+
# Do all node references in _v_id exist in nodes?
109+
unmatched = node_ids_edges_v - node_ids
110+
is_valid = len(unmatched) == 0
111+
if not is_valid:
112+
self.errors.append(f"All _v_id's in edges should be part of _id's mentioned in nodes, _v_id's not in nodes are: {unmatched}")
113+
114+
# Do all node references in _w_id exist in nodes?
115+
unmatched = node_ids_zones_w - node_ids
116+
is_valid = len(unmatched) == 0
117+
if not is_valid:
118+
self.errors.append(f"All _w_id's in zones should be part of _id's mentioned in nodes, _w_id's not in nodes are: {unmatched}")
119+
120+
# Geometry validation: check the geometry type in each file and test whether the coordinates form a valid geometric shape according to the Simple Feature Access standard
121+
for osw_file in OSW_dataset:
122+
invalid_geojson = OSW_dataset[osw_file][(OSW_dataset[osw_file].geometry.type != OSW_dataset_files[osw_file]['geometry']) | (OSW_dataset[osw_file].is_valid==False)]
123+
is_valid = len(invalid_geojson) == 0
124+
if not is_valid:
125+
self.errors.append(f"Invalid {osw_file} geometries found, id's of invalid geometries: {set(invalid_geojson['_id'])}")
126+
127+
# Validate OSW external extensions
128+
for file in validator.externalExtensions:
129+
file_path = os.path.join(file)
130+
extensionFile = gpd.read_file(file_path)
131+
invalid_geojson = extensionFile[extensionFile.is_valid==False]
132+
is_valid = len(invalid_geojson) == 0
133+
if not is_valid:
134+
self.errors.append(f"Invalid geometries found in extension file {file}, list of invalid geometries: {invalid_geojson.to_json()}")
135+
136+
if self.errors:
137+
zip_handler.remove_extracted_files()
138+
return ValidationResult(False, self.errors)
139+
else:
140+
return ValidationResult(True)
62141
except Exception as e:
63142
self.errors.append(f'Unable to validate: {e}')
64143
return ValidationResult(False, self.errors)
@@ -68,8 +147,9 @@ def load_osw_file(self, graph_geojson_path: str) -> Dict[str, Any]:
68147
with open(graph_geojson_path, 'r') as file:
69148
return json.load(file)
70149

71-
def validate_osw_errors(self, geojson_data: Dict[str, Any]) -> bool:
150+
def validate_osw_errors(self, file_path: str) -> bool:
72151
'''Validate OSW Data against the schema and process all errors'''
152+
geojson_data = self.load_osw_file(file_path)
73153
validator = jsonschema.Draft7Validator(self.load_osw_schema(self.schema_file_path))
74154
errors = list(validator.iter_errors(geojson_data))
75155

src/python_osw_validation/extracted_data_validator.py

Lines changed: 67 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,38 @@
22
import glob
33

44

5+
OSW_dataset_files = {"edges": {
6+
"required": False,
7+
"geometry": "LineString"
8+
},
9+
"nodes": {
10+
"required": False,
11+
"geometry": "Point"
12+
},
13+
"points": {
14+
"required": False,
15+
"geometry": "Point"
16+
},
17+
"lines": {
18+
"required": False,
19+
"geometry": "LineString"
20+
},
21+
"zones": {
22+
"required": False,
23+
"geometry": "Polygon"
24+
},
25+
"polygons": {
26+
"required": False,
27+
"geometry": "Polygon"
28+
}
29+
}
30+
31+
532
class ExtractedDataValidator:
633
def __init__(self, extracted_dir: str):
734
self.extracted_dir = extracted_dir
835
self.files = []
36+
self.externalExtensions = []
937
self.error = None
1038

1139
def is_valid(self) -> bool:
@@ -25,23 +53,48 @@ def is_valid(self) -> bool:
2553
self.error = 'No .geojson files found in the specified directory or its subdirectories.'
2654
return False
2755

28-
required_files = {'nodes', 'edges'}
29-
optional_files = {'points'}
30-
for filename in geojson_files:
31-
base_name = os.path.basename(filename)
32-
for required_file in required_files:
56+
required_files = [key for key, value in OSW_dataset_files.items() if value['required']]
57+
optional_files = [key for key, value in OSW_dataset_files.items() if not value['required']]
58+
missing_files = []
59+
duplicate_files = []
60+
for required_file in required_files:
61+
file_count = 0
62+
for filename in geojson_files:
63+
base_name = os.path.basename(filename)
3364
if required_file in base_name and base_name.endswith('.geojson'):
34-
self.files.append(filename)
35-
required_files.remove(required_file)
36-
break
37-
for optional_file in optional_files:
65+
file_count += 1
66+
save_filename = filename
67+
if file_count == 0:
68+
# Missing required file
69+
missing_files.append(required_file)
70+
elif file_count == 1:
71+
self.files.append(save_filename)
72+
else:
73+
# Duplicate file
74+
duplicate_files.append(required_file)
75+
76+
for optional_file in optional_files:
77+
file_count = 0
78+
for filename in geojson_files:
79+
base_name = os.path.basename(filename)
3880
if optional_file in base_name and base_name.endswith('.geojson'):
39-
self.files.append(filename)
40-
optional_files.remove(optional_file)
41-
break
81+
file_count += 1
82+
save_filename = filename
83+
if file_count == 1:
84+
self.files.append(save_filename)
85+
elif file_count > 1:
86+
# Duplicate file
87+
duplicate_files.append(optional_file)
4288

43-
if required_files:
44-
self.error = f'Missing required .geojson files: {", ".join(required_files)}.'
89+
if missing_files:
90+
self.error = f'Missing required .geojson files: {", ".join(missing_files)}.'
91+
return False
92+
93+
if duplicate_files:
94+
self.error = f'Multiple .geojson files of the same type found: {", ".join(duplicate_files)}.'
4595
return False
96+
97+
# Add OSW external extensions, GeoJSON files we know nothing about
98+
self.externalExtensions.extend([item for item in geojson_files if item not in self.files])
4699

47100
return True

0 commit comments

Comments
 (0)