@@ -50,35 +50,48 @@ def execute(self, state: dict) -> dict:
5050
5151 Args:
5252 state (dict): The current state of the graph. The input keys will be used to fetch the
53- correct data from the state.
53+ correct data from the state.
5454
5555 Returns:
5656 dict: The updated state with the output key containing the parsed content chunks.
5757
5858 Raises:
59- KeyError: If the input keys are not found in the state.
59+ KeyError: If the input keys are not found in the state, indicating that the
60+ necessary information for parsing the content is missing.
6061 """
6162
6263 self .logger .info (f"--- Executing { self .node_name } Node ---" )
6364
64- # Fetch data using input keys
65+ # Interpret input keys based on the provided input expression
6566 input_keys = self .get_input_keys (state )
67+
68+ # Fetching data from the state based on the input keys
6669 input_data = [state [key ] for key in input_keys ]
70+ # Parse the document
6771 docs_transformed = input_data [0 ]
68-
69- # Parse HTML if enabled
7072 if self .parse_html :
7173 docs_transformed = Html2TextTransformer ().transform_documents (input_data [0 ])
7274 docs_transformed = docs_transformed [0 ]
7375
74- # Get text content
75- text_content = docs_transformed . page_content if type ( docs_transformed ) == Document else docs_transformed
76-
77- # Chunk the text
78- chunk_size = self . node_config . get ( "chunk_size" , 4096 ) - 250
79- chunks = chunk ( text = text_content , chunk_size = chunk_size , token_counter = lambda x : len ( x . split ()), memoize = False )
76+ chunks = chunk ( text = docs_transformed . page_content ,
77+ chunk_size = self . node_config . get ( "chunk_size" , 4096 ),
78+ token_counter = lambda x : len ( x ),
79+ memoize = False )
80+ else :
81+ docs_transformed = docs_transformed [ 0 ]
8082
81- # Update state with chunks
83+ if type (docs_transformed ) == Document :
84+ chunks = chunk (text = docs_transformed .page_content ,
85+ chunk_size = self .node_config .get ("chunk_size" , 4096 ),
86+ token_counter = lambda x : len (x ),
87+ memoize = False )
88+ else :
89+
90+ chunks = chunk (text = docs_transformed ,
91+ chunk_size = self .node_config .get ("chunk_size" , 4096 ),
92+ token_counter = lambda x : len (x ),
93+ memoize = False )
94+
8295 state .update ({self .output [0 ]: chunks })
8396
84- return state
97+ return state
0 commit comments