Skip to content

Commit 606c841

Browse files
authored
Merge pull request #22 from TaskarCenterAtUW/feature-cleanup-files
Added Garbage Collection
2 parents b4de641 + bd1ef92 commit 606c841

File tree

6 files changed

+77
-75
lines changed

6 files changed

+77
-75
lines changed

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
# Change log
22

3+
### 0.2.6
4+
- Add garbage collection to free up memory after validation
5+
6+
37
### 0.2.5
4-
- Updated geopandas package
8+
- Updated geopandas package
9+
510

611
### 0.2.3
712
- Performance improvement if there are any errors

requirements.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
jsonschema~=4.19.1
2-
zipfile36~=0.1.3
3-
coverage~=7.5.1
1+
jsonschema
2+
zipfile36
3+
coverage
44
geopandas

setup.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,5 @@
1-
import os
2-
from setuptools import setup, find_packages, Extension
31
from version import version
4-
5-
project_path = os.path.dirname(os.path.realpath(__file__))
6-
requirements_file = '{}/requirements.txt'.format(project_path)
7-
8-
with open(requirements_file) as f:
9-
content = f.readlines()
10-
install_requires = [x.strip() for x in content]
2+
from setuptools import setup, find_packages
113

124
with open('README.md', 'r') as fh:
135
long_description = fh.read()
@@ -26,7 +18,11 @@
2618
},
2719
long_description_content_type='text/markdown',
2820
url='https://github.com/TaskarCenterAtUW/TDEI-python-lib-osw-validation',
29-
install_requires=install_requires,
21+
install_requires=[
22+
'jsonschema',
23+
'zipfile36',
24+
'geopandas'
25+
],
3026
packages=find_packages(where='src'),
3127
classifiers=[
3228
'Programming Language :: Python :: 3',
@@ -38,4 +34,4 @@
3834
package_data={
3935
'python_osw_validation': ['schema/*'],
4036
},
41-
)
37+
)

src/python_osw_validation/__init__.py

Lines changed: 30 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import os
2+
import gc
23
import json
34
import jsonschema
45
from typing import Dict, Any, Optional, List
56
import geopandas as gpd
67
from .zipfile_handler import ZipFileHandler
7-
from .extracted_data_validator import ExtractedDataValidator, OSW_dataset_files
8+
from .extracted_data_validator import ExtractedDataValidator, OSW_DATASET_FILES
89
from .version import __version__
910

1011
SCHEMA_PATH = os.path.join(os.path.dirname(__file__), 'schema')
@@ -29,7 +30,7 @@ def __init__(self, zipfile_path: str, schema_file_path=None):
2930
self.schema_file_path = schema_file_path
3031

3132
def load_osw_schema(self, schema_path: str) -> Dict[str, Any]:
32-
'''Load OSW Schema'''
33+
"""Load OSW Schema"""
3334
try:
3435
with open(schema_path, 'r') as file:
3536
return json.load(file)
@@ -40,12 +41,12 @@ def load_osw_schema(self, schema_path: str) -> Dict[str, Any]:
4041
def are_ids_unique(self, gdf):
4142
"""Check for duplicate values in the _id field"""
4243
duplicates = gdf[gdf.duplicated('_id', keep=False)]['_id'].unique()
43-
4444
is_valid = len(duplicates) == 0
45-
4645
return is_valid, list(duplicates)
4746

4847
def validate(self, max_errors=20) -> ValidationResult:
48+
zip_handler = None
49+
OSW_DATASET = {}
4950
try:
5051
# Extract the zipfile
5152
zip_handler = ZipFileHandler(self.zipfile_path)
@@ -60,43 +61,42 @@ def validate(self, max_errors=20) -> ValidationResult:
6061
if not validator.is_valid():
6162
self.errors.append(validator.error)
6263
return ValidationResult(False, self.errors)
64+
6365
for file in validator.files:
6466
file_path = os.path.join(file)
65-
if not self.validate_osw_errors(file_path, max_errors):
67+
if not self.validate_osw_errors(file_path=str(file_path), max_errors=max_errors):
6668
break
6769

6870
if self.errors:
69-
zip_handler.remove_extracted_files()
7071
return ValidationResult(False, self.errors)
7172

7273
# Validate data integrity
73-
OSW_dataset = {}
7474
for file in validator.files:
7575
file_path = os.path.join(file)
76-
osw_file = next((osw_file_any for osw_file_any in OSW_dataset_files.keys() if osw_file_any in file_path), '')
77-
OSW_dataset[osw_file] = gpd.read_file(file_path)
76+
osw_file = next((osw_file_any for osw_file_any in OSW_DATASET_FILES.keys() if osw_file_any in file_path), '')
77+
OSW_DATASET[osw_file] = gpd.read_file(file_path)
7878

7979
# Are all id's unique in each file? No need to check uniqueness across files yet since we do not have a global OSW ID format yet
80-
for osw_file in OSW_dataset:
81-
is_valid, duplicates = self.are_ids_unique(OSW_dataset[osw_file])
80+
for osw_file in OSW_DATASET:
81+
is_valid, duplicates = self.are_ids_unique(OSW_DATASET[osw_file])
8282
if not is_valid:
8383
self.errors.append(f"Duplicate _id's found in {osw_file} : {duplicates}")
8484

8585
# Create sets of node id's and foreign keys to be used in validation
86-
if "nodes" in OSW_dataset:
87-
node_ids = set(OSW_dataset['nodes']['_id'])
86+
if 'nodes' in OSW_DATASET:
87+
node_ids = set(OSW_DATASET['nodes']['_id'])
8888
else:
8989
node_ids = set()
9090

91-
if "edges" in OSW_dataset:
92-
node_ids_edges_u = set(OSW_dataset['edges']['_u_id'])
93-
node_ids_edges_v = set(OSW_dataset['edges']['_v_id'])
91+
if 'edges' in OSW_DATASET:
92+
node_ids_edges_u = set(OSW_DATASET['edges']['_u_id'])
93+
node_ids_edges_v = set(OSW_DATASET['edges']['_v_id'])
9494
else:
9595
node_ids_edges_u = set()
9696
node_ids_edges_v = set()
9797

98-
if "zones" in OSW_dataset:
99-
node_ids_zones_w = set([item for sublist in OSW_dataset['zones']['_w_id'] for item in sublist])
98+
if 'zones' in OSW_DATASET:
99+
node_ids_zones_w = set([item for sublist in OSW_DATASET['zones']['_w_id'] for item in sublist])
100100
else:
101101
node_ids_zones_w = set()
102102

@@ -119,8 +119,8 @@ def validate(self, max_errors=20) -> ValidationResult:
119119
self.errors.append(f"All _w_id's in zones should be part of _id's mentioned in nodes, _w_id's not in nodes are: {unmatched}")
120120

121121
# Geometry validation: check geometry type in each file and test if coordinates make a shape that is reasonable geometric shape according to the Simple Feature Access standard
122-
for osw_file in OSW_dataset:
123-
invalid_geojson = OSW_dataset[osw_file][(OSW_dataset[osw_file].geometry.type != OSW_dataset_files[osw_file]['geometry']) | (OSW_dataset[osw_file].is_valid == False)]
122+
for osw_file in OSW_DATASET:
123+
invalid_geojson = OSW_DATASET[osw_file][(OSW_DATASET[osw_file].geometry.type != OSW_DATASET_FILES[osw_file]['geometry']) | (OSW_DATASET[osw_file].is_valid == False)]
124124
is_valid = len(invalid_geojson) == 0
125125
if not is_valid:
126126
self.errors.append(f"Invalid {osw_file} geometries found, id's of invalid geometries: {set(invalid_geojson['_id'])}")
@@ -135,30 +135,31 @@ def validate(self, max_errors=20) -> ValidationResult:
135135
self.errors.append(f"Invalid geometries found in extension file {file}, list of invalid geometries: {invalid_geojson.to_json()}")
136136

137137
if self.errors:
138-
zip_handler.remove_extracted_files()
139138
return ValidationResult(False, self.errors)
140139
else:
141140
return ValidationResult(True)
142141
except Exception as e:
143142
self.errors.append(f'Unable to validate: {e}')
144143
return ValidationResult(False, self.errors)
144+
finally:
145+
del OSW_DATASET
146+
if zip_handler:
147+
zip_handler.remove_extracted_files()
148+
gc.collect()
145149

146150
def load_osw_file(self, graph_geojson_path: str) -> Dict[str, Any]:
147-
'''Load OSW Data'''
151+
"""Load OSW Data"""
148152
with open(graph_geojson_path, 'r') as file:
149153
return json.load(file)
150154

151155
def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
152-
'''Validate OSW Data against the schema and process all errors'''
156+
"""Validate OSW Data against the schema and process all errors"""
153157
geojson_data = self.load_osw_file(file_path)
154158
validator = jsonschema.Draft7Validator(self.load_osw_schema(self.schema_file_path))
155159

156160
for error in validator.iter_errors(geojson_data):
157161
self.errors.append(f'Validation error: {error.message}')
158-
if len(self.errors) == max_errors:
159-
break
160-
161-
if len(self.errors) >= max_errors:
162-
return False
162+
if len(self.errors) >= max_errors:
163+
return False
163164

164-
return True
165+
return len(self.errors) < max_errors

src/python_osw_validation/extracted_data_validator.py

Lines changed: 30 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,32 @@
11
import os
22
import glob
33

4-
5-
OSW_dataset_files = {"edges": {
6-
"required": False,
7-
"geometry": "LineString"
8-
},
9-
"nodes": {
10-
"required": False,
11-
"geometry": "Point"
12-
},
13-
"points": {
14-
"required": False,
15-
"geometry": "Point"
16-
},
17-
"lines": {
18-
"required": False,
19-
"geometry": "LineString"
20-
},
21-
"zones": {
22-
"required": False,
23-
"geometry": "Polygon"
24-
},
25-
"polygons": {
26-
"required": False,
27-
"geometry": "Polygon"
28-
}
29-
}
4+
OSW_DATASET_FILES = {
5+
"edges": {
6+
"required": False,
7+
"geometry": "LineString"
8+
},
9+
"nodes": {
10+
"required": False,
11+
"geometry": "Point"
12+
},
13+
"points": {
14+
"required": False,
15+
"geometry": "Point"
16+
},
17+
"lines": {
18+
"required": False,
19+
"geometry": "LineString"
20+
},
21+
"zones": {
22+
"required": False,
23+
"geometry": "Polygon"
24+
},
25+
"polygons": {
26+
"required": False,
27+
"geometry": "Polygon"
28+
}
29+
}
3030

3131

3232
class ExtractedDataValidator:
@@ -53,8 +53,8 @@ def is_valid(self) -> bool:
5353
self.error = 'No .geojson files found in the specified directory or its subdirectories.'
5454
return False
5555

56-
required_files = [key for key, value in OSW_dataset_files.items() if value['required']]
57-
optional_files = [key for key, value in OSW_dataset_files.items() if not value['required']]
56+
required_files = [key for key, value in OSW_DATASET_FILES.items() if value['required']]
57+
optional_files = [key for key, value in OSW_DATASET_FILES.items() if not value['required']]
5858
missing_files = []
5959
duplicate_files = []
6060
for required_file in required_files:
@@ -89,11 +89,11 @@ def is_valid(self) -> bool:
8989
if missing_files:
9090
self.error = f'Missing required .geojson files: {", ".join(missing_files)}.'
9191
return False
92-
92+
9393
if duplicate_files:
9494
self.error = f'Multiple .geojson files of the same type found: {", ".join(duplicate_files)}.'
9595
return False
96-
96+
9797
# Add OSW external extensions, GeoJSON files we know nothing about
9898
self.externalExtensions.extend([item for item in geojson_files if item not in self.files])
9999

src/python_osw_validation/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '0.2.5'
1+
__version__ = '0.2.6'

0 commit comments

Comments
 (0)