11import os
2+ import gc
23import json
34import jsonschema
45from typing import Dict , Any , Optional , List
56import geopandas as gpd
67from .zipfile_handler import ZipFileHandler
7- from .extracted_data_validator import ExtractedDataValidator , OSW_dataset_files
8+ from .extracted_data_validator import ExtractedDataValidator , OSW_DATASET_FILES
89from .version import __version__
910
1011SCHEMA_PATH = os .path .join (os .path .dirname (__file__ ), 'schema' )
@@ -29,7 +30,7 @@ def __init__(self, zipfile_path: str, schema_file_path=None):
2930 self .schema_file_path = schema_file_path
3031
3132 def load_osw_schema (self , schema_path : str ) -> Dict [str , Any ]:
32- ''' Load OSW Schema'''
33+ """ Load OSW Schema"""
3334 try :
3435 with open (schema_path , 'r' ) as file :
3536 return json .load (file )
@@ -40,12 +41,12 @@ def load_osw_schema(self, schema_path: str) -> Dict[str, Any]:
def are_ids_unique(self, gdf):
    """Check for duplicate values in the ``_id`` field.

    Args:
        gdf: Table-like object exposing ``duplicated()`` and an ``_id``
            column (the caller passes the result of ``gpd.read_file``).

    Returns:
        A ``(is_valid, duplicates)`` tuple: ``is_valid`` is ``True`` when no
        ``_id`` value occurs more than once, and ``duplicates`` lists each
        repeated ``_id`` value once.
    """
    # keep=False marks every member of a duplicate group, so the first
    # occurrence is reported as well; unique() then collapses the group
    # to a single entry per repeated id.
    duplicates = gdf[gdf.duplicated('_id', keep=False)]['_id'].unique()
    is_valid = len(duplicates) == 0
    return is_valid, list(duplicates)
4746
4847 def validate (self , max_errors = 20 ) -> ValidationResult :
48+ zip_handler = None
49+ OSW_DATASET = {}
4950 try :
5051 # Extract the zipfile
5152 zip_handler = ZipFileHandler (self .zipfile_path )
@@ -60,43 +61,42 @@ def validate(self, max_errors=20) -> ValidationResult:
6061 if not validator .is_valid ():
6162 self .errors .append (validator .error )
6263 return ValidationResult (False , self .errors )
64+
6365 for file in validator .files :
6466 file_path = os .path .join (file )
65- if not self .validate_osw_errors (file_path , max_errors ):
67+ if not self .validate_osw_errors (file_path = str ( file_path ), max_errors = max_errors ):
6668 break
6769
6870 if self .errors :
69- zip_handler .remove_extracted_files ()
7071 return ValidationResult (False , self .errors )
7172
7273 # Validate data integrity
73- OSW_dataset = {}
7474 for file in validator .files :
7575 file_path = os .path .join (file )
76- osw_file = next ((osw_file_any for osw_file_any in OSW_dataset_files .keys () if osw_file_any in file_path ), '' )
77- OSW_dataset [osw_file ] = gpd .read_file (file_path )
76+ osw_file = next ((osw_file_any for osw_file_any in OSW_DATASET_FILES .keys () if osw_file_any in file_path ), '' )
77+ OSW_DATASET [osw_file ] = gpd .read_file (file_path )
7878
7979 # Are all id's unique in each file? No need to check uniqueness across files yet since we do not have a global OSW ID format yet
80- for osw_file in OSW_dataset :
81- is_valid , duplicates = self .are_ids_unique (OSW_dataset [osw_file ])
80+ for osw_file in OSW_DATASET :
81+ is_valid , duplicates = self .are_ids_unique (OSW_DATASET [osw_file ])
8282 if not is_valid :
8383 self .errors .append (f"Duplicate _id's found in { osw_file } : { duplicates } " )
8484
8585 # Create sets of node id's and foreign keys to be used in validation
86- if " nodes" in OSW_dataset :
87- node_ids = set (OSW_dataset ['nodes' ]['_id' ])
86+ if ' nodes' in OSW_DATASET :
87+ node_ids = set (OSW_DATASET ['nodes' ]['_id' ])
8888 else :
8989 node_ids = set ()
9090
91- if " edges" in OSW_dataset :
92- node_ids_edges_u = set (OSW_dataset ['edges' ]['_u_id' ])
93- node_ids_edges_v = set (OSW_dataset ['edges' ]['_v_id' ])
91+ if ' edges' in OSW_DATASET :
92+ node_ids_edges_u = set (OSW_DATASET ['edges' ]['_u_id' ])
93+ node_ids_edges_v = set (OSW_DATASET ['edges' ]['_v_id' ])
9494 else :
9595 node_ids_edges_u = set ()
9696 node_ids_edges_v = set ()
9797
98- if " zones" in OSW_dataset :
99- node_ids_zones_w = set ([item for sublist in OSW_dataset ['zones' ]['_w_id' ] for item in sublist ])
98+ if ' zones' in OSW_DATASET :
99+ node_ids_zones_w = set ([item for sublist in OSW_DATASET ['zones' ]['_w_id' ] for item in sublist ])
100100 else :
101101 node_ids_zones_w = set ()
102102
@@ -119,8 +119,8 @@ def validate(self, max_errors=20) -> ValidationResult:
119119 self .errors .append (f"All _w_id's in zones should be part of _id's mentioned in nodes, _w_id's not in nodes are: { unmatched } " )
120120
121121 # Geometry validation: check geometry type in each file and test if coordinates make a shape that is reasonable geometric shape according to the Simple Feature Access standard
122- for osw_file in OSW_dataset :
123- invalid_geojson = OSW_dataset [osw_file ][(OSW_dataset [osw_file ].geometry .type != OSW_dataset_files [osw_file ]['geometry' ]) | (OSW_dataset [osw_file ].is_valid == False )]
122+ for osw_file in OSW_DATASET :
123+ invalid_geojson = OSW_DATASET [osw_file ][(OSW_DATASET [osw_file ].geometry .type != OSW_DATASET_FILES [osw_file ]['geometry' ]) | (OSW_DATASET [osw_file ].is_valid == False )]
124124 is_valid = len (invalid_geojson ) == 0
125125 if not is_valid :
126126 self .errors .append (f"Invalid { osw_file } geometries found, id's of invalid geometries: { set (invalid_geojson ['_id' ])} " )
@@ -135,30 +135,31 @@ def validate(self, max_errors=20) -> ValidationResult:
135135 self .errors .append (f"Invalid geometries found in extension file { file } , list of invalid geometries: { invalid_geojson .to_json ()} " )
136136
137137 if self .errors :
138- zip_handler .remove_extracted_files ()
139138 return ValidationResult (False , self .errors )
140139 else :
141140 return ValidationResult (True )
142141 except Exception as e :
143142 self .errors .append (f'Unable to validate: { e } ' )
144143 return ValidationResult (False , self .errors )
144+ finally :
145+ del OSW_DATASET
146+ if zip_handler :
147+ zip_handler .remove_extracted_files ()
148+ gc .collect ()
145149
def load_osw_file(self, graph_geojson_path: str) -> Dict[str, Any]:
    """Load OSW data from a GeoJSON file.

    Args:
        graph_geojson_path: Path of the GeoJSON file to read.

    Returns:
        The parsed JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which would misread non-ASCII text on non-UTF-8 locales.
    with open(graph_geojson_path, 'r', encoding='utf-8') as file:
        return json.load(file)
150154
def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
    """Validate OSW data against the schema and record the errors found.

    Each schema violation is appended to ``self.errors`` as a
    ``'Validation error: ...'`` message. Collection stops as soon as
    ``self.errors`` holds ``max_errors`` entries or more.

    Args:
        file_path: Path of the GeoJSON file to validate.
        max_errors: Size of ``self.errors`` at which validation stops.

    Returns:
        ``True`` while ``self.errors`` holds fewer than ``max_errors``
        entries after this file was checked, ``False`` otherwise.
    """
    geojson_data = self.load_osw_file(file_path)
    validator = jsonschema.Draft7Validator(self.load_osw_schema(self.schema_file_path))

    for error in validator.iter_errors(geojson_data):
        self.errors.append(f'Validation error: {error.message}')
        # Stop at the cap so a badly broken file cannot flood the report.
        if len(self.errors) >= max_errors:
            return False

    # self.errors is shared across calls, so entries recorded before this
    # call also count towards the cap.
    return len(self.errors) < max_errors
0 commit comments