Skip to content

Commit 9917972

Browse files
committed
fixed request
1 parent d8fcb6c commit 9917972

File tree

4 files changed: 2 additions and 24 deletions.

requirements-dev.lock

Lines changed: 0 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,6 @@ babel==2.15.0
4040
beautifulsoup4==4.12.3
4141
# via furo
4242
# via google
43-
# via markdownify
4443
# via scrapegraphai
4544
blinker==1.8.2
4645
# via streamlit
@@ -246,18 +245,13 @@ lxml==5.2.2
246245
lxml-html-clean==0.1.1
247246
# via lxml
248247
markdown-it-py==3.0.0
249-
# via mdformat
250248
# via rich
251-
markdownify==0.12.1
252-
# via scrapegraphai
253249
markupsafe==2.1.5
254250
# via jinja2
255251
marshmallow==3.21.3
256252
# via dataclasses-json
257253
matplotlib==3.9.0
258254
# via burr
259-
mdformat==0.7.17
260-
# via scrapegraphai
261255
mdurl==0.1.2
262256
# via markdown-it-py
263257
minify-html==0.15.0
@@ -347,8 +341,6 @@ pygments==2.18.0
347341
# via furo
348342
# via rich
349343
# via sphinx
350-
pyhtml2md==1.6.0
351-
# via scrapegraphai
352344
pyparsing==3.1.2
353345
# via httplib2
354346
# via matplotlib
@@ -409,7 +401,6 @@ sf-hamilton==1.66.1
409401
shellingham==1.5.4
410402
# via typer
411403
six==1.16.0
412-
# via markdownify
413404
# via python-dateutil
414405
smmap==5.0.1
415406
# via gitdb

requirements.lock

Lines changed: 0 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,6 @@ babel==2.15.0
2828
# via courlan
2929
beautifulsoup4==4.12.3
3030
# via google
31-
# via markdownify
3231
# via scrapegraphai
3332
boto3==1.34.129
3433
# via langchain-aws
@@ -175,16 +174,8 @@ lxml==5.2.2
175174
# via trafilatura
176175
lxml-html-clean==0.1.1
177176
# via lxml
178-
markdown-it-py==3.0.0
179-
# via mdformat
180-
markdownify==0.12.1
181-
# via scrapegraphai
182177
marshmallow==3.21.3
183178
# via dataclasses-json
184-
mdformat==0.7.17
185-
# via scrapegraphai
186-
mdurl==0.1.2
187-
# via markdown-it-py
188179
minify-html==0.15.0
189180
# via scrapegraphai
190181
multidict==6.0.5
@@ -238,8 +229,6 @@ pydantic-core==2.18.4
238229
# via pydantic
239230
pyee==11.1.0
240231
# via playwright
241-
pyhtml2md==1.6.0
242-
# via scrapegraphai
243232
pyparsing==3.1.2
244233
# via httplib2
245234
python-dateutil==2.9.0.post0
@@ -275,7 +264,6 @@ s3transfer==0.10.1
275264
semchunk==1.0.1
276265
# via scrapegraphai
277266
six==1.16.0
278-
# via markdownify
279267
# via python-dateutil
280268
sniffio==1.3.1
281269
# via anthropic

scrapegraphai/nodes/fetch_node.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
import requests
1010
from langchain_community.document_loaders import PyPDFLoader
1111
from langchain_core.documents import Document
12-
12+
from ..utils.cleanup_html import cleanup_html
1313
from ..docloaders import ChromiumLoader
1414
from ..utils.convert_to_md import convert_to_md
1515
from ..utils.logging import get_logger
@@ -164,7 +164,7 @@ def execute(self, state):
164164
if not response.text.strip():
165165
raise ValueError("No HTML body content found in the response.")
166166

167-
parsed_content = source
167+
parsed_content = cleanup_html(response, source)
168168

169169
if isinstance(self.llm_model, OpenAI) and not self.script_creator or self.force and not self.script_creator:
170170
parsed_content = convert_to_md(source)

scrapegraphai/utils/convert_to_md.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22
convert_to_md module
33
"""
44
import html2text
5-
import mdformat
65
from trafilatura import extract
76

87

0 commit comments

Comments
 (0)