fix: parse_node chunking (count characters instead of words, use the full configured chunk_size)

VinciGit00 · VinciGit00 · commit 07f1e23d235d · 2024-07-17T22:58:21.000+02:00
diff --git a/examples/local_models/smart_scraper_ollama.py b/examples/local_models/smart_scraper_ollama.py
@@ -29,7 +29,7 @@
 
 smart_scraper_graph = SmartScraperGraph(
     prompt="List me all the titles",
-    source="https://sport.sky.it/nba?gr=www",
+    source="https://perinim.github.io/projects",
     config=graph_config
 )
 
diff --git a/scrapegraphai/nodes/base_node.py b/scrapegraphai/nodes/base_node.py
@@ -88,7 +88,6 @@ def update_config(self, params: dict, overwrite: bool = False):
             param (dict): The dictionary to update node_config with.
             overwrite (bool): Flag indicating if the values of node_config should be overwritten if their value is not None.
         """
-        
         for key, val in params.items():
             if hasattr(self, key) and not overwrite:
                 continue
diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py
@@ -121,7 +121,7 @@ def execute(self, state: dict) -> dict:
                 answer = chain.invoke({"question": user_prompt})
                 break
 
-            prompt = PromptTemplate(
+                prompt = PromptTemplate(
                     template=template_chunks_prompt,
                     input_variables=["question"],
                     partial_variables={"context": chunk,
diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py
@@ -50,35 +50,48 @@ def execute(self, state: dict) -> dict:
 
         Args:
             state (dict): The current state of the graph. The input keys will be used to fetch the
-                        correct data from the state.
+                            correct data from the state.
 
         Returns:
             dict: The updated state with the output key containing the parsed content chunks.
 
         Raises:
-            KeyError: If the input keys are not found in the state.
+            KeyError: If the input keys are not found in the state, indicating that the
+                        necessary information for parsing the content is missing.
         """
 
         self.logger.info(f"--- Executing {self.node_name} Node ---")
 
-        # Fetch data using input keys
+        # Interpret input keys based on the provided input expression
         input_keys = self.get_input_keys(state)
+
+        # Fetching data from the state based on the input keys
         input_data = [state[key] for key in input_keys]
+        # Parse the document
         docs_transformed = input_data[0]
-
-        # Parse HTML if enabled
         if self.parse_html:
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
-        # Get text content
-        text_content = docs_transformed.page_content if type(docs_transformed) == Document else docs_transformed
-
-        # Chunk the text
-        chunk_size = self.node_config.get("chunk_size", 4096) - 250
-        chunks = chunk(text=text_content, chunk_size=chunk_size, token_counter=lambda x: len(x.split()), memoize=False)
+            chunks = chunk(text=docs_transformed.page_content,
+                            chunk_size= self.node_config.get("chunk_size", 4096),
+                            token_counter=lambda x: len(x),
+                            memoize=False)
+        else:
+            docs_transformed = docs_transformed[0]
 
-        # Update state with chunks
+            if type(docs_transformed) == Document:
+                chunks = chunk(text=docs_transformed.page_content,
+                            chunk_size= self.node_config.get("chunk_size", 4096),
+                            token_counter=lambda x: len(x),
+                            memoize=False)
+            else:
+                
+                chunks = chunk(text=docs_transformed,
+                                chunk_size= self.node_config.get("chunk_size", 4096),
+                                token_counter=lambda x: len(x),
+                                memoize=False)
+                          
         state.update({self.output[0]: chunks})
 
-        return state
+        return state

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@`
`29`	`29`
`30`	`30`	`smart_scraper_graph = SmartScraperGraph(`
`31`	`31`	`prompt="List me all the titles",`
`32`		`- source="https://sport.sky.it/nba?gr=www",`
	`32`	`+ source="https://perinim.github.io/projects",`
`33`	`33`	`config=graph_config`
`34`	`34`	`)`
`35`	`35`