Skip to content

Commit 7ec2aa8

Browse files
authored
Merge pull request #47 from TaskarCenterAtUW/develop
[0.3.0] Deployement
2 parents 9a2f65c + 7969e69 commit 7ec2aa8

20 files changed

+10449
-8299
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Change log
22

3+
### 0.3.0
4+
- Default to OSW 0.3 dataset-specific schemas (edges, lines, nodes, points, polygons, zones) with filename-driven selection; removed legacy monolithic/geometry schema files.
5+
- Enforce the six canonical OSW 0.3 filenames inside datasets; reject non-standard names and detect duplicates/missing required files (with new unit tests).
6+
- Validation now ignores `$schema` hints and does not fall back to geometry typing; line schema is the final fallback when filenames give no hint.
7+
- Expanded test coverage for extension read failures, invalid extension ID extraction, `_w_id` missing in zones, cleanup edge cases, and required-file detection.
8+
39
### 0.2.15
410
- Updated the base schema to make the `$schema` key required
511
- Added unit test cases for that

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ This package validates the OSW geojson file. Package requires a OSW zip file pat
1212

1313
- It unzips the provided zip file
1414
- Checks for the required nodes and edges geojson files inside the unzipped folder
15-
- Validate each file (nodes, edges and points) against schema, schema can be found here
15+
- Validate each file (edges, lines, nodes, points, polygons and zones) against the matching schema (0.3 defaults live in `src/python_osw_validation/schema`)
1616
- Returns true or false depending on the validation result
1717
- You can inspect the reported errors if it returned false.
1818

@@ -127,4 +127,4 @@ To use the library locally, use the [example.py](./src/example.py) code
127127
- Choose `main` branch for release
128128
- Publish the release.
129129
- This release triggers a workflow to generate the new version of the Package.
130-
- The new package will be available at https://pypi.org/project/python-osw-validation/
130+
- The new package will be available at https://pypi.org/project/python-osw-validation/

src/example.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
INVALID_ZIP_FILE = os.path.join(ASSETS_DIR, '4151.zip')
88
INVALID_VANCOUVER_ZIP_FILE = os.path.join(ASSETS_DIR, 'vancouver-dataset.zip')
99
SCHEMA_DIR = os.path.join(PARENT_DIR, 'src/python_osw_validation/schema')
10-
SCHEMA_FILE_PATH = os.path.join(SCHEMA_DIR, 'opensidewalks.schema.json')
10+
SCHEMA_FILE_PATH = os.path.join(SCHEMA_DIR, 'opensidewalks.schema-0.3.json')
1111

1212

1313
def valid_test_with_provided_schema():

src/python_osw_validation/__init__.py

Lines changed: 77 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,14 @@
1212
from .helpers import _feature_index_from_error, _pretty_message, _rank_for
1313

1414
SCHEMA_PATH = os.path.join(os.path.dirname(__file__), 'schema')
15+
DEFAULT_DATASET_SCHEMAS = {
16+
"edges": os.path.join(SCHEMA_PATH, 'opensidewalks.edges.schema-0.3.json'),
17+
"lines": os.path.join(SCHEMA_PATH, 'opensidewalks.lines.schema-0.3.json'),
18+
"nodes": os.path.join(SCHEMA_PATH, 'opensidewalks.nodes.schema-0.3.json'),
19+
"points": os.path.join(SCHEMA_PATH, 'opensidewalks.points.schema-0.3.json'),
20+
"polygons": os.path.join(SCHEMA_PATH, 'opensidewalks.polygons.schema-0.3.json'),
21+
"zones": os.path.join(SCHEMA_PATH, 'opensidewalks.zones.schema-0.3.json'),
22+
}
1523

1624

1725
class ValidationResult:
@@ -33,17 +41,18 @@ def __init__(self, is_valid: bool, errors: Optional[List[str]] = None,
3341

3442

3543
class OSWValidation:
36-
default_schema_file_path = os.path.join(SCHEMA_PATH, 'opensidewalks.schema.json')
44+
default_schema_file_path_03 = os.path.join(SCHEMA_PATH, 'opensidewalks.schema-0.3.json')
3745

3846
# per-geometry defaults
39-
default_point_schema = os.path.join(SCHEMA_PATH, 'Point_schema.json')
40-
default_line_schema = os.path.join(SCHEMA_PATH, 'Linestring_schema.json')
41-
default_polygon_schema = os.path.join(SCHEMA_PATH, 'Polygon_schema.json')
47+
default_point_schema = DEFAULT_DATASET_SCHEMAS['points']
48+
default_line_schema = DEFAULT_DATASET_SCHEMAS['edges']
49+
default_polygon_schema = DEFAULT_DATASET_SCHEMAS['zones']
4250

4351
def __init__(
4452
self,
4553
zipfile_path: str,
4654
schema_file_path=None,
55+
schema_paths: Optional[Dict[str, str]] = None,
4756
point_schema_path: Optional[str] = None,
4857
line_schema_path: Optional[str] = None,
4958
polygon_schema_path: Optional[str] = None,
@@ -57,10 +66,15 @@ def __init__(
5766
# Legacy single schema (if set, used for all)
5867
self.schema_file_path = schema_file_path # may be None
5968

69+
# Dataset-specific schemas (override via schema_paths)
70+
self.dataset_schema_paths = {**DEFAULT_DATASET_SCHEMAS}
71+
if schema_paths:
72+
self.dataset_schema_paths.update({k: v for k, v in schema_paths.items() if v})
73+
6074
# Per-geometry schemas (with defaults)
61-
self.point_schema_path = point_schema_path or self.default_point_schema
62-
self.line_schema_path = line_schema_path or self.default_line_schema
63-
self.polygon_schema_path = polygon_schema_path or self.default_polygon_schema
75+
self.point_schema_path = point_schema_path or self.dataset_schema_paths['points']
76+
self.line_schema_path = line_schema_path or self.dataset_schema_paths['edges']
77+
self.polygon_schema_path = polygon_schema_path or self.dataset_schema_paths['zones']
6478

6579
# ----------------------------
6680
# Utilities & helpers
@@ -92,6 +106,45 @@ def _get_colset(self, gdf: Optional[gpd.GeoDataFrame], col: str, filekey: str) -
92106
self.log_errors(f"Could not create set for column '{col}' in {filekey}.", filekey, None)
93107
return set()
94108

109+
def _schema_key_from_text(self, text: Optional[str]) -> Optional[str]:
110+
"""Return dataset key (edges/nodes/points/lines/polygons/zones) if mentioned in text."""
111+
if not text:
112+
return None
113+
lower = text.lower()
114+
aliases = {
115+
"edges": ("edge", "edges"),
116+
"lines": ("line", "lines", "linestring"),
117+
"nodes": ("node", "nodes"),
118+
"points": ("point", "points"),
119+
"polygons": ("polygon", "polygons", "area"),
120+
"zones": ("zone", "zones"),
121+
}
122+
for key, variants in aliases.items():
123+
if any(alias in lower for alias in variants):
124+
return key
125+
return None
126+
127+
def _contains_disallowed_features_for_02(self, geojson_data: Dict[str, Any]) -> bool:
128+
"""Detect Tree coverage or Custom Point/Line/Polygon in legacy 0.2 datasets."""
129+
for feat in geojson_data.get("features", []):
130+
props = feat.get("properties") or {}
131+
val = props.get("natural")
132+
if isinstance(val, str) and val.strip().lower() in {"tree", "wood"}:
133+
return True
134+
if any(k in props for k in ("leaf_cycle", "leaf_type")):
135+
return True
136+
for k, v in props.items():
137+
target = ""
138+
if isinstance(v, str):
139+
target = v.lower()
140+
elif isinstance(k, str):
141+
target = k.lower()
142+
if any(tok in target for tok in ["custom point", "custom_point", "custompoint",
143+
"custom line", "custom_line", "customline",
144+
"custom polygon", "custom_polygon", "custompolygon"]):
145+
return True
146+
return False
147+
95148
# ----------------------------
96149
# Schema selection
97150
# ----------------------------
@@ -118,25 +171,12 @@ def are_ids_unique(self, gdf):
118171
def pick_schema_for_file(self, file_path: str, geojson_data: Dict[str, Any]) -> str:
119172
if self.schema_file_path:
120173
return self.schema_file_path
121-
try:
122-
features = geojson_data.get('features', [])
123-
if features:
124-
gtype = (features[0].get('geometry') or {}).get('type')
125-
if gtype == 'Point':
126-
return self.point_schema_path
127-
if gtype == 'LineString':
128-
return self.line_schema_path
129-
if gtype == 'Polygon':
130-
return self.polygon_schema_path
131-
except Exception:
132-
pass
133-
lower = os.path.basename(file_path).lower()
134-
if 'node' in lower or 'point' in lower:
135-
return self.point_schema_path
136-
if 'edge' in lower or 'line' in lower:
137-
return self.line_schema_path
138-
if 'zone' in lower or 'polygon' in lower or 'area' in lower:
139-
return self.polygon_schema_path
174+
175+
basename = os.path.basename(file_path)
176+
schema_key = self._schema_key_from_text(basename)
177+
if schema_key and schema_key in self.dataset_schema_paths:
178+
return self.dataset_schema_paths[schema_key]
179+
140180
return self.line_schema_path
141181

142182
# ----------------------------
@@ -432,6 +472,17 @@ def validate_osw_errors(self, file_path: str, max_errors: int) -> bool:
432472
return False
433473
except OSError:
434474
return False
475+
476+
schema_url = geojson_data.get('$schema')
477+
if isinstance(schema_url, str) and '0.2/schema.json' in schema_url:
478+
if self._contains_disallowed_features_for_02(geojson_data):
479+
self.log_errors(
480+
message="0.2 schema does not support Tree coverage, Custom Point, Custom Line, and Custom Polygon",
481+
filename=os.path.basename(file_path),
482+
feature_index=None,
483+
)
484+
return False
485+
435486
schema_path = self.pick_schema_for_file(file_path, geojson_data)
436487
schema = self.load_osw_schema(schema_path)
437488
validator = jsonschema_rs.Draft7Validator(schema)

src/python_osw_validation/extracted_data_validator.py

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,24 @@
2929
}
3030
}
3131

32+
ALLOWED_OSW_03_FILENAMES = (
33+
"opensidewalks.edges.geojson",
34+
"opensidewalks.lines.geojson",
35+
"opensidewalks.nodes.geojson",
36+
"opensidewalks.points.geojson",
37+
"opensidewalks.polygons.geojson",
38+
"opensidewalks.zones.geojson",
39+
)
40+
41+
_FILENAME_TO_KEY = {
42+
"opensidewalks.edges.geojson": "edges",
43+
"opensidewalks.lines.geojson": "lines",
44+
"opensidewalks.nodes.geojson": "nodes",
45+
"opensidewalks.points.geojson": "points",
46+
"opensidewalks.polygons.geojson": "polygons",
47+
"opensidewalks.zones.geojson": "zones",
48+
}
49+
3250

3351
class ExtractedDataValidator:
3452
def __init__(self, extracted_dir: str):
@@ -45,15 +63,41 @@ def is_valid(self) -> bool:
4563

4664
# Look for required files at the root level
4765
geojson_files = glob.glob(os.path.join(self.extracted_dir, '*.geojson'))
48-
49-
# If not found at the root, check inside folders
50-
if not geojson_files:
51-
geojson_files = glob.glob(os.path.join(self.extracted_dir, '*', '*.geojson'))
66+
nested_files = glob.glob(os.path.join(self.extracted_dir, '*', '*.geojson'))
67+
for f in nested_files:
68+
if f not in geojson_files:
69+
geojson_files.append(f)
5270

5371
if not geojson_files:
5472
self.error = 'No .geojson files found in the specified directory or its subdirectories.'
5573
return False
5674

75+
basenames = [os.path.basename(f) for f in geojson_files]
76+
is_osw_03 = any(name.startswith("opensidewalks.") for name in basenames)
77+
78+
if is_osw_03:
79+
invalid_basenames = [bn for bn in basenames if bn not in ALLOWED_OSW_03_FILENAMES]
80+
if invalid_basenames:
81+
allowed_fmt = ", ".join(ALLOWED_OSW_03_FILENAMES)
82+
self.error = f'Dataset contains non-standard file names. The only allowed file names are {{{allowed_fmt}}}'
83+
return False
84+
85+
duplicate_keys = []
86+
for filename in ALLOWED_OSW_03_FILENAMES:
87+
occurrences = [f for f in geojson_files if os.path.basename(f) == filename]
88+
if len(occurrences) > 1:
89+
duplicate_keys.append(_FILENAME_TO_KEY.get(filename, filename))
90+
elif len(occurrences) == 1:
91+
self.files.append(occurrences[0])
92+
93+
if duplicate_keys:
94+
self.error = f'Multiple .geojson files of the same type found: {", ".join(duplicate_keys)}.'
95+
return False
96+
97+
self.externalExtensions.extend([item for item in geojson_files if item not in self.files])
98+
gc.collect()
99+
return True
100+
57101
required_files = [key for key, value in OSW_DATASET_FILES.items() if value['required']]
58102
optional_files = [key for key, value in OSW_DATASET_FILES.items() if not value['required']]
59103
missing_files = []
@@ -106,7 +150,7 @@ def is_valid(self) -> bool:
106150

107151
finally:
108152
# Cleanup large lists and call garbage collector
109-
del geojson_files, required_files, optional_files, missing_files, duplicate_files
153+
del geojson_files, basenames, required_files, optional_files, missing_files, duplicate_files
110154
gc.collect()
111155

112156
return True

0 commit comments

Comments
 (0)