Merge pull request #7 from TaskarCenterAtUW/dev

sujata-m · web-flow · commit 199183d1ab68 · 2024-10-15T16:53:17.000+05:30
Dev to Main
diff --git a/.github/workflows/publish_to_test.yml b/.github/workflows/publish_to_test.yml
@@ -39,5 +39,5 @@ jobs:
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           skip_existing: true
-          password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+          password: ${{ secrets.PYPI_API_TOKEN }}
           repository_url: https://test.pypi.org/legacy/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,11 @@
+### 0.0.2
+- Fixed [Task-1347](https://dev.azure.com/TDEI-UW/TDEI/_workitems/edit/1347/).
+- Fixed package to removing the additional keys from the geojson files. 
+- Introduced garbage collection to free up memory.
+- Added ability to skip the tags which are already present in the edges file.
+- Added ability to process the incline tags in batch processing.
+
+
 ### 0.0.1
 - Introduces osw_inclination package which calculates the inclination of the sidewalk based on the DEM data.
 - Added example.py file which demonstrates how to use the package.
diff --git a/README.md b/README.md
@@ -47,6 +47,12 @@ osw_incline = OSWIncline(
 # Perform the incline calculation, it will add the incline to the  original edges file 
 result = osw_incline.calculate()
 
+# To skip the incline tags which are already present in the edges file
+result = osw_incline.calculate(skip_existing_tags=True)
+
+# To update the incline tags in batch processing (It might be faster than the normal calculation but increases the memory usage)
+result = osw_incline.calculate(batch_processing=True)
+
 if result:
     print("Incline calculation completed successfully.")
 ```
diff --git a/src/osw_incline/__init__.py b/src/osw_incline/__init__.py
@@ -1,3 +1,4 @@
+import gc
 import time
 from typing import List
 from pathlib import Path
@@ -6,6 +7,7 @@
 from .version import __version__
 from .dem_processor import DEMProcessor
 
+
 class OSWIncline:
     def __init__(self, dem_files: List[str], nodes_file: str, edges_file: str, debug=False):
         self.dem_files = dem_files
@@ -15,8 +17,7 @@ def __init__(self, dem_files: List[str], nodes_file: str, edges_file: str, debug
         if self.debug:
             Logger.debug('Debug mode is enabled')
 
-
-    def calculate(self):
+    def calculate(self, skip_existing_tags=False, batch_processing=False):
         try:
             if self.debug:
                 Logger.debug('Starting calculation process')
@@ -32,8 +33,16 @@ def calculate(self):
             dem_processor = DEMProcessor(osm_graph=osm_graph, dem_files=self.dem_files, debug=self.debug)
             dem_processor.process(
                 nodes_path=graph_nodes_path,
-                edges_path=graph_edges_path
+                edges_path=graph_edges_path,
+                skip_existing_tags=skip_existing_tags,
+                batch_processing=batch_processing
             )
+
+            # Delete osm_graph and dem_processor to force garbage collection
+            osm_graph.clean()
+            del osm_graph, dem_processor
+            gc.collect()
+
             end_time = time.time()
             time_taken = end_time - start_time
             if self.debug:
@@ -43,7 +52,8 @@ def calculate(self):
             if self.debug:
                 Logger.error(f'Error processing DEM files: {e}')
             raise Exception(f'Error processing DEM files: {e}')
+        finally:
+            gc.collect()
 
 
-
-OSWIncline.__version__ = __version__
+OSWIncline.__version__ = __version__
diff --git a/src/osw_incline/dem_processor.py b/src/osw_incline/dem_processor.py
@@ -1,3 +1,4 @@
+import gc
 import math
 import pyproj
 import rasterio
@@ -21,23 +22,50 @@ def __init__(self, osm_graph: OSMGraph, dem_files: List[str], debug=False):
         self.OG = osm_graph
         self.debug = debug
 
-    def process(self, nodes_path, edges_path):
+    def process(self, nodes_path, edges_path, skip_existing_tags=False, batch_processing=False):
+        gc.disable()
         for dem_file in self.dem_files:
             dem_file_path = Path(dem_file)
             if self.debug:
                 Logger.debug(f'Processing DEM tile: {dem_file_path}')
 
             try:
                 with rasterio.open(dem_file_path) as dem:
-                    for u, v, d in self.OG.G.edges(data=True):
-                        if 'geometry' in d:
-                            incline = self.infer_incline(linestring=d['geometry'], dem=dem, precision=3)
-                            if incline is not None:
-                                # Add incline to the edge properties
-                                d['incline'] = incline
-                        else:
-                            if self.debug:
-                                Logger.info(f'No geometry found for edge {u}-{v}')
+                    """
+                    Option 1:
+                        Pros:
+                            Batching: This approach processes edges in batches of 1000, which can be faster for large graphs.
+                            Parallelization: The second approach can be parallelized by using a ThreadPoolExecutor or similar.
+                        Cons:
+                            Memory usage: The second approach stores all edges in a list, which could be memory-intensive for large graphs.
+                            Intermediate list storage: The second approach stores the entire edge set as a list in memory, which is not memory-efficient.
+                    """
+                    if batch_processing:
+                        edges = list(self.OG.G.edges(data=True))  # Get all edges, even if fewer than batch_size
+                        self._process_in_batches(edges, dem, batch_size=1000, skip_existing_tags=skip_existing_tags)
+                    else:
+                        """
+                        Option 2:
+                            Pros:
+                                Simple iteration: The first approach iterates over the edges one by one, making the memory footprint relatively small, especially if you have a large number of edges.
+                                No intermediate list storage: It does not store the entire edge set as a list in memory, which is better for memory efficiency.
+                            Cons:
+                                Single-threaded: The entire edge processing happens sequentially, which can be slower for very large graphs, as there's no batching or parallelization.
+                                No batching: It processes all edges at once in a loop, which could cause memory spikes during large computations if infer_incline holds intermediate states or large datasets.
+                        """
+                        for u, v, d in self.OG.G.edges(data=True):
+                            if 'geometry' in d:
+                                if skip_existing_tags:
+                                    if 'incline' in d and d['incline'] is not None:
+                                        # If incline already exists, skip
+                                        continue
+                                incline = self.infer_incline(linestring=d['geometry'], dem=dem, precision=3)
+                                if incline is not None:
+                                    # Add incline to the edge properties
+                                    d['incline'] = incline
+                            else:
+                                if self.debug:
+                                    Logger.info(f'No geometry found for edge {u}-{v}')
 
                 self.OG.to_geojson(nodes_path, edges_path)
             except rasterio.errors.RasterioIOError:
@@ -48,6 +76,26 @@ def process(self, nodes_path, edges_path):
                 if self.debug:
                     Logger.error(f'Error processing DEM file: {dem_file_path}, error: {e}')
                 raise Exception(f'Error processing DEM file: {dem_file_path}, error: {e}')
+            finally:
+                gc.collect()
+
+        gc.disable()
+
+    def _process_in_batches(self, edges, dem, batch_size=1000, skip_existing_tags=False):
+        # Process edges in batches
+        for i in range(0, len(edges), batch_size):
+            batch = edges[i:i + batch_size]
+            for u, v, d in batch:
+                if 'geometry' in d:
+                    if skip_existing_tags:
+                        if 'incline' in d and d['incline'] is not None:
+                            # If incline already exists, skip
+                            continue
+                    incline = self.infer_incline(linestring=d['geometry'], dem=dem, precision=3)
+                    if incline is not None:
+                        d['incline'] = incline
+            # Trigger garbage collection after each batch
+            gc.collect()
 
     def infer_incline(self, linestring, dem, precision=3):
         first_point = linestring.coords[0]
@@ -154,6 +202,7 @@ def interpolated_value(self, x, y, dem, method='idw', scaling_factor=1.0):
 
         interpolated = interpolator(dx, dy, dem_arr)
 
+        del dem_arr
         if interpolated is None:
             return interpolated
         else:
@@ -187,6 +236,7 @@ def idw(self, dx, dy, masked_array):
 
         value = weighted_values.sum()
 
+        del xs, ys, values_masked, weighted_values
 
         if np.isnan(value):
             return None
@@ -210,4 +260,4 @@ def bivariate_spline(self, dx, dy, arr):
         spline = RectBivariateSpline(
             np.array(range(ncol)), np.array(range(nrow)), arr, kx=kx, ky=ky
         )
-        return spline(dx, dy)[0][0]
+        return spline(dx, dy)[0][0]
diff --git a/src/osw_incline/osm_graph.py b/src/osw_incline/osm_graph.py
@@ -1,8 +1,10 @@
+import gc
 import json
 import pyproj
 import networkx as nx
 from shapely.geometry import shape, mapping
 
+SCHEMA = 'https://sidewalks.washington.edu/opensidewalks/0.2/schema.json'
 
 class OSMGraph:
     def __init__(self, G=None):
@@ -29,13 +31,19 @@ def from_geojson(cls, nodes_path, edges_path):
             props['geometry'] = shape(node_feature['geometry'])
             G.add_node(n, **props)
 
+        del nodes_fc
+        gc.collect()
+
         for edge_feature in edges_fc['features']:
             props = edge_feature['properties']
             u = props.pop('_u_id')
             v = props.pop('_v_id')
             props['geometry'] = shape(edge_feature['geometry'])
             G.add_edge(u, v, **props)
 
+        del edges_fc
+        gc.collect()
+
         return osm_graph
 
     def to_geojson(self, *args):
@@ -58,7 +66,18 @@ def to_geojson(self, *args):
                 'geometry': geometry,
                 'properties': d_copy
             })
-        edges_fc = {'type': 'FeatureCollection', 'features': edge_features}
+        edges_fc = {
+            'type': 'FeatureCollection',
+            'features': edge_features,
+            '$schema': SCHEMA
+        }
+
+        with open(edges_path, 'w') as f:
+            json.dump(edges_fc, f)
+
+        # Delete edge_features and force garbage collection
+        del edge_features, edges_fc
+        gc.collect()
 
         node_features = []
         for n, d in self.G.nodes(data=True):
@@ -82,14 +101,19 @@ def to_geojson(self, *args):
                     'geometry': geometry,
                     'properties': d_copy
                 })
-        nodes_fc = {'type': 'FeatureCollection', 'features': node_features}
-
-        with open(edges_path, 'w') as f:
-            json.dump(edges_fc, f)
+        nodes_fc = {
+            'type': 'FeatureCollection',
+            'features': node_features,
+            '$schema': SCHEMA
+        }
 
         with open(nodes_path, 'w') as f:
             json.dump(nodes_fc, f)
 
+        # Delete node_features and force garbage collection
+        del node_features, nodes_fc
+        gc.collect()
+
         if len(args) == 3:
             points_path = args[2]
             point_features = []
@@ -116,7 +140,19 @@ def to_geojson(self, *args):
                         'geometry': geometry,
                         'properties': d_copy
                     })
-            points_fc = {'type': 'FeatureCollection', 'features': point_features}
+            points_fc = {
+                'type': 'FeatureCollection',
+                'features': point_features,
+                '$schema': SCHEMA
+            }
 
             with open(points_path, 'w') as f:
                 json.dump(points_fc, f)
+
+            # Delete point_features and force garbage collection
+            del point_features, points_fc
+            gc.collect()
+
+    def clean(self):
+        del self.G
+        gc.collect()
diff --git a/src/osw_incline/version.py b/src/osw_incline/version.py
@@ -1 +1 @@
-__version__ = '0.0.1'
+__version__ = '0.0.2'
diff --git a/tests/test_dem_processor.py b/tests/test_dem_processor.py
@@ -39,6 +39,28 @@ def test_process_success(self, mock_rasterio_open):
 
         self.osm_graph.to_geojson.assert_called_once_with('nodes.json', 'edges.json')
 
+    @patch('src.osw_incline.dem_processor.rasterio.open')
+    def test_process_success_with_batching(self, mock_rasterio_open):
+        mock_dem = MagicMock()
+        mock_rasterio_open.return_value.__enter__.return_value = mock_dem
+        self.osm_graph.G.edges.return_value = [('u', 'v', {'geometry': LineString([(0, 0), (1, 1)])})]
+
+        with patch.object(self.processor, 'infer_incline', return_value=0.1):
+            self.processor.process('nodes.json', 'edges.json', batch_processing=True)
+
+        self.osm_graph.to_geojson.assert_called_once_with('nodes.json', 'edges.json')
+
+    @patch('src.osw_incline.dem_processor.rasterio.open')
+    def test_process_success_with_skip(self, mock_rasterio_open):
+        mock_dem = MagicMock()
+        mock_rasterio_open.return_value.__enter__.return_value = mock_dem
+        self.osm_graph.G.edges.return_value = [('u', 'v', {'geometry': LineString([(0, 0), (1, 1)])})]
+
+        with patch.object(self.processor, 'infer_incline', return_value=0.1):
+            self.processor.process('nodes.json', 'edges.json', skip_existing_tags=True)
+
+        self.osm_graph.to_geojson.assert_called_once_with('nodes.json', 'edges.json')
+
     # Test processing when RasterioIOError is raised
     @patch('src.osw_incline.dem_processor.rasterio.open')
     def test_process_rasterio_io_error(self, mock_rasterio_open):
@@ -401,7 +423,6 @@ def test_interpolated_value_return_scaled(self, mock_rasterio_open):
 
         # Assert that the result is not None and that it is scaled properly
         self.assertIsNotNone(result, 'IDW interpolation should return a valid value')
-        print(f"IDW Interpolated Value (scaled): {result}")
 
         # You can adjust the expected result based on the IDW logic; here, just check that it's non-zero
         self.assertGreater(result, 0, 'The interpolated result should be greater than 0')
diff --git a/tests/test_osm_graph.py b/tests/test_osm_graph.py
@@ -256,6 +256,12 @@ def test_to_geojson_with_points_and_osm_id(self):
             nodes_data = json.load(f)
             self.assertEqual(len(nodes_data['features']), 0)
 
+    def test_clean(self):
+        osm_graph = OSMGraph.from_geojson(self.nodes_geojson, self.edges_geojson)
+        self.assertEqual(len(osm_graph.G.nodes), 3)
+        osm_graph.clean()
+        self.assertFalse(hasattr(osm_graph, 'G'))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_osw_incline.py b/tests/test_osw_incline.py

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = '0.0.1'`
	`1`	`+__version__ = '0.0.2'`