22import json
33import jsonschema
44from typing import Dict , Any , Optional , List
5+ import geopandas as gpd
56from .zipfile_handler import ZipFileHandler
6- from .extracted_data_validator import ExtractedDataValidator
7+ from .extracted_data_validator import ExtractedDataValidator , OSW_dataset_files
8+ from .version import __version__
79
810SCHEMA_PATH = os .path .join (os .path .dirname (__file__ ), 'schema' )
911
@@ -34,6 +36,14 @@ def load_osw_schema(self, schema_path: str) -> Dict[str, Any]:
3436 except Exception as e :
3537 self .errors .append (f'Invalid or missing schema file: { e } ' )
3638 raise Exception (f'Invalid or missing schema file: { e } ' )
39+
def are_ids_unique(self, gdf):
    """Report whether every value in the ``_id`` column of *gdf* is unique.

    Args:
        gdf: A table exposing ``duplicated`` and label-based column access
            (``validate`` passes the result of ``gpd.read_file``).

    Returns:
        A ``(is_valid, duplicates)`` tuple. ``is_valid`` is ``True`` when no
        ``_id`` value occurs more than once; ``duplicates`` lists each
        repeated ``_id`` value once, in order of first appearance.
    """
    # keep=False flags every occurrence of a repeated id, not just the later ones.
    repeated_rows = gdf.duplicated('_id', keep=False)
    repeated_ids = list(gdf.loc[repeated_rows, '_id'].unique())

    return not repeated_ids, repeated_ids
3747
3848 def validate (self ) -> ValidationResult :
3949 try :
@@ -50,15 +60,84 @@ def validate(self) -> ValidationResult:
5060 if not validator .is_valid ():
5161 self .errors .append (validator .error )
5262 return ValidationResult (False , self .errors )
63+ for file in validator .files :
64+ file_path = os .path .join (file )
65+ is_valid = self .validate_osw_errors (file_path )
5366
67+ if self .errors :
68+ zip_handler .remove_extracted_files ()
69+ return ValidationResult (False , self .errors )
70+
71+ # Validate data integrity
72+ OSW_dataset = {}
5473 for file in validator .files :
5574 file_path = os .path .join (file )
56- is_valid = self .validate_osw_errors (self .load_osw_file (file_path ))
75+ osw_file = next ((osw_file_any for osw_file_any in OSW_dataset_files .keys () if osw_file_any in file_path ), '' )
76+ OSW_dataset [osw_file ] = gpd .read_file (file_path )
77+
78+ # Are all id's unique in each file? No need to check uniqueness across files yet since we do not have a global OSW ID format yet
79+ for osw_file in OSW_dataset :
80+ is_valid , duplicates = self .are_ids_unique (OSW_dataset [osw_file ])
5781 if not is_valid :
58- zip_handler .remove_extracted_files ()
59- return ValidationResult (False , self .errors )
82+ self .errors .append (f"Duplicate _id's found in { osw_file } : { duplicates } " )
83+
84+ # Create sets of node id's and foreign keys to be used in validation
85+ if "nodes" in OSW_dataset :
86+ node_ids = set (OSW_dataset ['nodes' ]['_id' ])
87+ else :
88+ node_ids = set ()
89+
90+ if "edges" in OSW_dataset :
91+ node_ids_edges_u = set (OSW_dataset ['edges' ]['_u_id' ])
92+ node_ids_edges_v = set (OSW_dataset ['edges' ]['_v_id' ])
93+ else :
94+ node_ids_edges_u = set ()
95+ node_ids_edges_v = set ()
96+
97+ if "zones" in OSW_dataset :
98+ node_ids_zones_w = set ([item for sublist in OSW_dataset ['zones' ]['_w_id' ] for item in sublist ])
99+ else :
100+ node_ids_zones_w = set ()
101+
102+ # Do all node references in _u_id exist in nodes?
103+ unmatched = node_ids_edges_u - node_ids
104+ is_valid = len (unmatched ) == 0
105+ if not is_valid :
106+ self .errors .append (f"All _u_id's in edges should be part of _id's mentioned in nodes, _u_id's not in nodes are: { unmatched } " )
60107
61- return ValidationResult (True )
108+ # Do all node references in _v_id exist in nodes?
109+ unmatched = node_ids_edges_v - node_ids
110+ is_valid = len (unmatched ) == 0
111+ if not is_valid :
112+ self .errors .append (f"All _v_id's in edges should be part of _id's mentioned in nodes, _v_id's not in nodes are: { unmatched } " )
113+
114+ # Do all node references in _w_id exist in nodes?
115+ unmatched = node_ids_zones_w - node_ids
116+ is_valid = len (unmatched ) == 0
117+ if not is_valid :
118+ self .errors .append (f"All _w_id's in zones should be part of _id's mentioned in nodes, _w_id's not in nodes are: { unmatched } " )
119+
120+ # Geometry validation: check geometry type in each file and test if coordinates make a shape that is reasonable geometric shape according to the Simple Feature Access standard
121+ for osw_file in OSW_dataset :
122+ invalid_geojson = OSW_dataset [osw_file ][(OSW_dataset [osw_file ].geometry .type != OSW_dataset_files [osw_file ]['geometry' ]) | (OSW_dataset [osw_file ].is_valid == False )]
123+ is_valid = len (invalid_geojson ) == 0
124+ if not is_valid :
125+ self .errors .append (f"Invalid { osw_file } geometries found, id's of invalid geometries: { set (invalid_geojson ['_id' ])} " )
126+
127+ # Validate OSW external extensions
128+ for file in validator .externalExtensions :
129+ file_path = os .path .join (file )
130+ extensionFile = gpd .read_file (file_path )
131+ invalid_geojson = extensionFile [extensionFile .is_valid == False ]
132+ is_valid = len (invalid_geojson ) == 0
133+ if not is_valid :
134+ self .errors .append (f"Invalid geometries found in extension file { file } , list of invalid geometries: { invalid_geojson .to_json ()} " )
135+
136+ if self .errors :
137+ zip_handler .remove_extracted_files ()
138+ return ValidationResult (False , self .errors )
139+ else :
140+ return ValidationResult (True )
62141 except Exception as e :
63142 self .errors .append (f'Unable to validate: { e } ' )
64143 return ValidationResult (False , self .errors )
@@ -68,8 +147,9 @@ def load_osw_file(self, graph_geojson_path: str) -> Dict[str, Any]:
68147 with open (graph_geojson_path , 'r' ) as file :
69148 return json .load (file )
70149
71- def validate_osw_errors (self , geojson_data : Dict [ str , Any ] ) -> bool :
150+ def validate_osw_errors (self , file_path : str ) -> bool :
72151 '''Validate OSW Data against the schema and process all errors'''
152+ geojson_data = self .load_osw_file (file_path )
73153 validator = jsonschema .Draft7Validator (self .load_osw_schema (self .schema_file_path ))
74154 errors = list (validator .iter_errors (geojson_data ))
75155
0 commit comments