File tree Expand file tree Collapse file tree 4 files changed +2
-24
lines changed
Expand file tree Collapse file tree 4 files changed +2
-24
lines changed Original file line number Diff line number Diff line change @@ -40,7 +40,6 @@ babel==2.15.0
4040beautifulsoup4==4.12.3
4141 # via furo
4242 # via google
43- # via markdownify
4443 # via scrapegraphai
4544blinker==1.8.2
4645 # via streamlit
@@ -246,18 +245,13 @@ lxml==5.2.2
246245lxml-html-clean==0.1.1
247246 # via lxml
248247markdown-it-py==3.0.0
249- # via mdformat
250248 # via rich
251- markdownify==0.12.1
252- # via scrapegraphai
253249markupsafe==2.1.5
254250 # via jinja2
255251marshmallow==3.21.3
256252 # via dataclasses-json
257253matplotlib==3.9.0
258254 # via burr
259- mdformat==0.7.17
260- # via scrapegraphai
261255mdurl==0.1.2
262256 # via markdown-it-py
263257minify-html==0.15.0
@@ -347,8 +341,6 @@ pygments==2.18.0
347341 # via furo
348342 # via rich
349343 # via sphinx
350- pyhtml2md==1.6.0
351- # via scrapegraphai
352344pyparsing==3.1.2
353345 # via httplib2
354346 # via matplotlib
@@ -409,7 +401,6 @@ sf-hamilton==1.66.1
409401shellingham==1.5.4
410402 # via typer
411403six==1.16.0
412- # via markdownify
413404 # via python-dateutil
414405smmap==5.0.1
415406 # via gitdb
Original file line number Diff line number Diff line change @@ -28,7 +28,6 @@ babel==2.15.0
2828 # via courlan
2929beautifulsoup4==4.12.3
3030 # via google
31- # via markdownify
3231 # via scrapegraphai
3332boto3==1.34.129
3433 # via langchain-aws
@@ -175,16 +174,8 @@ lxml==5.2.2
175174 # via trafilatura
176175lxml-html-clean==0.1.1
177176 # via lxml
178- markdown-it-py==3.0.0
179- # via mdformat
180- markdownify==0.12.1
181- # via scrapegraphai
182177marshmallow==3.21.3
183178 # via dataclasses-json
184- mdformat==0.7.17
185- # via scrapegraphai
186- mdurl==0.1.2
187- # via markdown-it-py
188179minify-html==0.15.0
189180 # via scrapegraphai
190181multidict==6.0.5
@@ -238,8 +229,6 @@ pydantic-core==2.18.4
238229 # via pydantic
239230pyee==11.1.0
240231 # via playwright
241- pyhtml2md==1.6.0
242- # via scrapegraphai
243232pyparsing==3.1.2
244233 # via httplib2
245234python-dateutil==2.9.0.post0
@@ -275,7 +264,6 @@ s3transfer==0.10.1
275264semchunk==1.0.1
276265 # via scrapegraphai
277266six==1.16.0
278- # via markdownify
279267 # via python-dateutil
280268sniffio==1.3.1
281269 # via anthropic
Original file line number Diff line number Diff line change 99import requests
1010from langchain_community .document_loaders import PyPDFLoader
1111from langchain_core .documents import Document
12-
12+ from .. utils . cleanup_html import cleanup_html
1313from ..docloaders import ChromiumLoader
1414from ..utils .convert_to_md import convert_to_md
1515from ..utils .logging import get_logger
@@ -164,7 +164,7 @@ def execute(self, state):
164164 if not response .text .strip ():
165165 raise ValueError ("No HTML body content found in the response." )
166166
167- parsed_content = source
167+ parsed_content = cleanup_html ( response , source )
168168
169169 if isinstance (self .llm_model , OpenAI ) and not self .script_creator or self .force and not self .script_creator :
170170 parsed_content = convert_to_md (source )
Original file line number Diff line number Diff line change 22convert_to_md module
33"""
44import html2text
5- import mdformat
65from trafilatura import extract
76
87
You can’t perform that action at this time.
0 commit comments