Skip to content

Commit 172708b

Browse files
authored
Merge pull request #27 from TaskarCenterAtUW/develop
Develop to main
2 parents b55c12a + f2c47aa commit 172708b

File tree

6 files changed

+86
-49
lines changed

6 files changed

+86
-49
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Change log
22

3+
### 0.2.7
4+
- Switch to `jsonschema_rs` instead of the `jsonschema` package for a performance enhancement
5+
- Refactor code to improve memory utilization
6+
- Added garbage collector
7+
8+
39
### 0.2.6
410
- Add garbage collection to free up memory after validation
511

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
jsonschema
1+
jsonschema_rs
22
zipfile36
33
coverage
44
geopandas

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
long_description_content_type='text/markdown',
2020
url='https://github.com/TaskarCenterAtUW/TDEI-python-lib-osw-validation',
2121
install_requires=[
22-
'jsonschema',
22+
'jsonschema_rs',
2323
'zipfile36',
2424
'geopandas'
2525
],

src/python_osw_validation/__init__.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
import os
22
import gc
33
import json
4-
import jsonschema
5-
from typing import Dict, Any, Optional, List
4+
import jsonschema_rs
65
import geopandas as gpd
76
from .zipfile_handler import ZipFileHandler
7+
from typing import Dict, Any, Optional, List
88
from .extracted_data_validator import ExtractedDataValidator, OSW_DATASET_FILES
99
from .version import __version__
1010

@@ -37,7 +37,7 @@ def load_osw_schema(self, schema_path: str) -> Dict[str, Any]:
3737
except Exception as e:
3838
self.errors.append(f'Invalid or missing schema file: {e}')
3939
raise Exception(f'Invalid or missing schema file: {e}')
40-
40+
4141
def are_ids_unique(self, gdf):
4242
"""Check for duplicate values in the _id field"""
4343
duplicates = gdf[gdf.duplicated('_id', keep=False)]['_id'].unique()
@@ -47,6 +47,7 @@ def are_ids_unique(self, gdf):
4747
def validate(self, max_errors=20) -> ValidationResult:
4848
zip_handler = None
4949
OSW_DATASET = {}
50+
validator = None
5051
try:
5152
# Extract the zipfile
5253
zip_handler = ZipFileHandler(self.zipfile_path)
@@ -73,7 +74,8 @@ def validate(self, max_errors=20) -> ValidationResult:
7374
# Validate data integrity
7475
for file in validator.files:
7576
file_path = os.path.join(file)
76-
osw_file = next((osw_file_any for osw_file_any in OSW_DATASET_FILES.keys() if osw_file_any in file_path), '')
77+
osw_file = next(
78+
(osw_file_any for osw_file_any in OSW_DATASET_FILES.keys() if osw_file_any in file_path), '')
7779
OSW_DATASET[osw_file] = gpd.read_file(file_path)
7880

7981
# Are all id's unique in each file? No need to check uniqueness across files yet since we do not have a global OSW ID format yet
@@ -104,26 +106,32 @@ def validate(self, max_errors=20) -> ValidationResult:
104106
unmatched = node_ids_edges_u - node_ids
105107
is_valid = len(unmatched) == 0
106108
if not is_valid:
107-
self.errors.append(f"All _u_id's in edges should be part of _id's mentioned in nodes, _u_id's not in nodes are: {unmatched}")
109+
self.errors.append(
110+
f"All _u_id's in edges should be part of _id's mentioned in nodes, _u_id's not in nodes are: {unmatched}")
108111

109112
# Do all node references in _v_id exist in nodes?
110113
unmatched = node_ids_edges_v - node_ids
111114
is_valid = len(unmatched) == 0
112115
if not is_valid:
113-
self.errors.append(f"All _v_id's in edges should be part of _id's mentioned in nodes, _v_id's not in nodes are: {unmatched}")
116+
self.errors.append(
117+
f"All _v_id's in edges should be part of _id's mentioned in nodes, _v_id's not in nodes are: {unmatched}")
114118

115119
# Do all node references in _w_id exist in nodes?
116120
unmatched = node_ids_zones_w - node_ids
117121
is_valid = len(unmatched) == 0
118122
if not is_valid:
119-
self.errors.append(f"All _w_id's in zones should be part of _id's mentioned in nodes, _w_id's not in nodes are: {unmatched}")
123+
self.errors.append(
124+
f"All _w_id's in zones should be part of _id's mentioned in nodes, _w_id's not in nodes are: {unmatched}")
120125

121126
# Geometry validation: check geometry type in each file and test if coordinates make a shape that is reasonable geometric shape according to the Simple Feature Access standard
122127
for osw_file in OSW_DATASET:
123-
invalid_geojson = OSW_DATASET[osw_file][(OSW_DATASET[osw_file].geometry.type != OSW_DATASET_FILES[osw_file]['geometry']) | (OSW_DATASET[osw_file].is_valid == False)]
128+
invalid_geojson = OSW_DATASET[osw_file][
129+
(OSW_DATASET[osw_file].geometry.type != OSW_DATASET_FILES[osw_file]['geometry']) | (
130+
OSW_DATASET[osw_file].is_valid == False)]
124131
is_valid = len(invalid_geojson) == 0
125132
if not is_valid:
126-
self.errors.append(f"Invalid {osw_file} geometries found, id's of invalid geometries: {set(invalid_geojson['_id'])}")
133+
self.errors.append(
134+
f"Invalid {osw_file} geometries found, id's of invalid geometries: {set(invalid_geojson['_id'])}")
127135

128136
# Validate OSW external extensions
129137
for file in validator.externalExtensions:
@@ -132,7 +140,8 @@ def validate(self, max_errors=20) -> ValidationResult:
132140
invalid_geojson = extensionFile[extensionFile.is_valid == False]
133141
is_valid = len(invalid_geojson) == 0
134142
if not is_valid:
135-
self.errors.append(f"Invalid geometries found in extension file {file}, list of invalid geometries: {invalid_geojson.to_json()}")
143+
self.errors.append(
144+
f"Invalid geometries found in extension file {file}, list of invalid geometries: {invalid_geojson.to_json()}")
136145

137146
if self.errors:
138147
return ValidationResult(False, self.errors)
@@ -145,6 +154,16 @@ def validate(self, max_errors=20) -> ValidationResult:
145154
del OSW_DATASET
146155
if zip_handler:
147156
zip_handler.remove_extracted_files()
157+
158+
# Force garbage collection to free memory
159+
gc.collect()
160+
161+
# Additional memory cleanup for geopandas dataframes
162+
if validator:
163+
for osw_file in validator.files:
164+
if osw_file in locals():
165+
del osw_file
166+
del validator
148167
gc.collect()
149168

150169
def load_osw_file(self, graph_geojson_path: str) -> Dict[str, Any]:
@@ -155,7 +174,7 @@ def load_osw_file(self, graph_geojson_path: str) -> Dict[str, Any]:
155174
def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
156175
"""Validate OSW Data against the schema and process all errors"""
157176
geojson_data = self.load_osw_file(file_path)
158-
validator = jsonschema.Draft7Validator(self.load_osw_schema(self.schema_file_path))
177+
validator = jsonschema_rs.Draft7Validator(self.load_osw_schema(self.schema_file_path))
159178

160179
for error in validator.iter_errors(geojson_data):
161180
self.errors.append(f'Validation error: {error.message}')

src/python_osw_validation/extracted_data_validator.py

Lines changed: 47 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import gc
23
import glob
34

45
OSW_DATASET_FILES = {
@@ -57,44 +58,55 @@ def is_valid(self) -> bool:
5758
optional_files = [key for key, value in OSW_DATASET_FILES.items() if not value['required']]
5859
missing_files = []
5960
duplicate_files = []
60-
for required_file in required_files:
61-
file_count = 0
62-
for filename in geojson_files:
63-
base_name = os.path.basename(filename)
64-
if required_file in base_name and base_name.endswith('.geojson'):
65-
file_count += 1
66-
save_filename = filename
67-
if file_count == 0:
68-
# Missing required file
69-
missing_files.append(required_file)
70-
elif file_count == 1:
71-
self.files.append(save_filename)
72-
else:
73-
# Duplicate file
74-
duplicate_files.append(required_file)
61+
save_filename = None # Initialize this variable
7562

76-
for optional_file in optional_files:
77-
file_count = 0
78-
for filename in geojson_files:
79-
base_name = os.path.basename(filename)
80-
if optional_file in base_name and base_name.endswith('.geojson'):
81-
file_count += 1
82-
save_filename = filename
83-
if file_count == 1:
84-
self.files.append(save_filename)
85-
elif file_count > 1:
86-
# Duplicate file
87-
duplicate_files.append(optional_file)
63+
try:
64+
# Process required files
65+
for required_file in required_files:
66+
file_count = 0
67+
for filename in geojson_files:
68+
base_name = os.path.basename(filename)
69+
if required_file in base_name and base_name.endswith('.geojson'):
70+
file_count += 1
71+
save_filename = filename
72+
if file_count == 0:
73+
# Missing required file
74+
missing_files.append(required_file)
75+
elif file_count == 1:
76+
self.files.append(save_filename)
77+
else:
78+
# Duplicate file
79+
duplicate_files.append(required_file)
8880

89-
if missing_files:
90-
self.error = f'Missing required .geojson files: {", ".join(missing_files)}.'
91-
return False
81+
# Process optional files
82+
for optional_file in optional_files:
83+
file_count = 0
84+
for filename in geojson_files:
85+
base_name = os.path.basename(filename)
86+
if optional_file in base_name and base_name.endswith('.geojson'):
87+
file_count += 1
88+
save_filename = filename
89+
if file_count == 1:
90+
self.files.append(save_filename)
91+
elif file_count > 1:
92+
# Duplicate file
93+
duplicate_files.append(optional_file)
9294

93-
if duplicate_files:
94-
self.error = f'Multiple .geojson files of the same type found: {", ".join(duplicate_files)}.'
95-
return False
95+
# Check for missing or duplicate files
96+
if missing_files:
97+
self.error = f'Missing required .geojson files: {", ".join(missing_files)}.'
98+
return False
99+
100+
if duplicate_files:
101+
self.error = f'Multiple .geojson files of the same type found: {", ".join(duplicate_files)}.'
102+
return False
103+
104+
# Add OSW external extensions, GeoJSON files we know nothing about
105+
self.externalExtensions.extend([item for item in geojson_files if item not in self.files])
96106

97-
# Add OSW external extensions, GeoJSON files we know nothing about
98-
self.externalExtensions.extend([item for item in geojson_files if item not in self.files])
107+
finally:
108+
# Cleanup large lists and call garbage collector
109+
del geojson_files, required_files, optional_files, missing_files, duplicate_files
110+
gc.collect()
99111

100112
return True
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '0.2.6'
1+
__version__ = '0.2.7'

0 commit comments

Comments
 (0)