diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..38cd5be
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,3 @@
+.gitignore merge=ours
+README.md merge=ours
+docker-compose.yml merge=ours
diff --git a/.gitignore b/.gitignore
index 1fbafed..cbca715 100644
--- a/.gitignore
+++ b/.gitignore
@@ -60,3 +60,146 @@
 testing_workflow.py
 *.yaml
 scripts/
+playwright_browser
+local.settings.json
+function_app/
+downloads/
+*.pdf
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don’t work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Azure Functions artifacts
+bin
+obj
+appsettings.json
+local.settings.json
+
+# Azurite artifacts
+__blobstorage__
+__queuestorage__
+__azurite_db*__.json
+.python_packages
+
+playwright_browser/
\ No newline at end of file
diff --git a/backend/mainService/requirements.txt b/backend/mainService/requirements.txt
index a5189a8..04a3d93 100644
--- a/backend/mainService/requirements.txt
+++ b/backend/mainService/requirements.txt
@@ -27,4 +27,5 @@
 uvicorn
 httpx>=0.28.1
 pypdf
 pypdf2
+azure-functions
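A caveat on the `.gitattributes` entries above: `ours` is not one of Git's built-in merge drivers (those are `text`, `binary`, and `union`), so `merge=ours` only takes effect once a driver by that name is defined, for example with `git config --global merge.ours.driver true`. Without that definition, Git silently falls back to normal merge behavior for these files.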
+ """ + PLAYWRIGHT_EXE_PATH=None # set to None if you want to use the default playwright executable def __post_init__(self): if self.MAX_FILE_SIZE <= 0: raise ValueError("MAX_FILE_SIZE must be positive") if self.TIMEOUT_DURATION <= 0: raise ValueError("TIMEOUT_DURATION must be positive") + os.makedirs(self.MAIN_DOWNLOADS_DIR_PATH, exist_ok=True) @dataclass @@ -85,14 +96,6 @@ class LlmConfig: """ UPSERT_BATCH_SIZE: int = 1000 - """ - This is the llm that open router uses for generating the intext citation and reference list for each query - """ - OPEN_ROUTER_MODEL: str = "meta-llama/llama-3.3-70b-instruct:free" - - """ - This is the azure model api endpoint - """ # Concurrency and Performance @@ -101,12 +104,24 @@ class ConcurrencyConfig: """Configuration class for concurrency settings.""" # General concurrency settings + """ + This is the number of concurrent workers that will be used to process the source documents. + """ DEFAULT_CONCURRENT_WORKERS: int = (os.cpu_count() // 2) + 1 - HANDLE_INDEX_DELETE_WORKERS: int = 2 - # Credibility service specific settings + """ + This is the maximum number of threads that will be used to calculate the credibility of the source documents. + """ CREDIBILITY_MAX_THREADS: int = 4 # Maximum threads for credibility calculations + + """ + This is the maximum number of concurrent operations that will be used to calculate the credibility of the source documents. + """ CREDIBILITY_MAX_CONCURRENT: int = 8 # Maximum concurrent operations + + """ + This is the size of the processing batches that will be used to calculate the credibility of the source documents. + """ CREDIBILITY_BATCH_SIZE: int = 4 # Size of processing batches @@ -117,13 +132,23 @@ class ModelConfig: Contains settings specific to AI models and their deployment.""" """Configuration for ML models and APIs.""" - MODEL_ID: str = "BAAI/bge-m3" - MODEL_API_URL: str = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{MODEL_ID}" - # LLM Generation Parameters - DEFAULT_TEMPERATURE: float = 0.5 - DEFAULT_TOP_P: float = 1.0 - DEFAULT_MAX_TOKENS: int = 1024 + """ + This is the temperature for the citation LLM. + """ + CITE_LLM_TEMPERATURE: float = 0.1 + """ + This is the temperature for the summarize LLM. + """ + SUMMARIZE_LLM_TEMPERATURE: float = 0.9 + """ + This is the top p for the citation LLM. + """ + CITE_LLM_TOP_P: float = 0.1 + """ + This is the top p for the summarize LLM. 
+ """ + SUMMARIZE_LLM_TOP_P: float = 0.1 @dataclass diff --git a/backend/mainService/src/config/playwright_driver.py b/backend/mainService/src/config/playwright_driver.py index 1eb5e28..10d8ea1 100644 --- a/backend/mainService/src/config/playwright_driver.py +++ b/backend/mainService/src/config/playwright_driver.py @@ -129,8 +129,9 @@ async def __initialize_browser(self) -> Browser: "--disable-blink-features=AutomationControlled", ] try: + exe_path = scraper_config.PLAYWRIGHT_EXE_PATH or None self._playwright = await async_playwright().start() - self._browser = await self._playwright.chromium.launch(headless=True, args=args) + self._browser = await self._playwright.chromium.launch(headless=True, args=args, executable_path=exe_path) except Exception as e: logger.critical(f"Error while initializing browser: {e}") raise e diff --git a/backend/mainService/src/llm/chat_llm/Azure_llm.py b/backend/mainService/src/llm/chat_llm/Azure_llm.py index 1e8bb84..dd346b8 100644 --- a/backend/mainService/src/llm/chat_llm/Azure_llm.py +++ b/backend/mainService/src/llm/chat_llm/Azure_llm.py @@ -15,7 +15,7 @@ from src.custom_exceptions.llm_exceptions import CitationGenerationError import logging from concurrent.futures import ThreadPoolExecutor -from src.config.config import concurrency_config +from src.config.config import concurrency_config, model_config logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel( logging.WARNING) @@ -128,8 +128,11 @@ def _blocking_citation_request( Dict[str, Any]: Raw API response containing citation data """ try: - response: ChatCompletions = self.client.complete(messages=messages, model=( - model_name or self.model_name), temperature=0.1, top_p=0.1) + response: ChatCompletions = self.client.complete( + messages=messages, + model=(model_name or self.model_name), + temperature=model_config.CITE_LLM_TEMPERATURE, + top_p=model_config.CITE_LLM_TOP_P) response_content = response.choices[0].message.content # amazonq-ignore-next-line response_content = response_content.strip() diff --git a/backend/mainService/src/llm/chat_llm/Groq_llm.py b/backend/mainService/src/llm/chat_llm/Groq_llm.py index a2a6370..e35e42b 100644 --- a/backend/mainService/src/llm/chat_llm/Groq_llm.py +++ b/backend/mainService/src/llm/chat_llm/Groq_llm.py @@ -6,6 +6,7 @@ from typing import Optional from json.decoder import JSONDecodeError from src.custom_exceptions.llm_exceptions import SearchKeyGenerationError +from src.config.config import model_config filename = os.path.basename(__file__) logger = setup_logging(filename=filename) @@ -59,9 +60,9 @@ def getKeywordSearchTerm(self, document: str, proposed_title: Optional[str] = No "content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'.{document}" }, ], - temperature=0.9, - top_p=1, - max_tokens=1024, + temperature=model_config.SUMMARIZE_LLM_TEMPERATURE, + top_p=model_config.SUMMARIZE_LLM_TOP_P, + max_tokens=200, stream=False, stop=None, response_format={"type": "json_object"} diff --git a/backend/mainService/src/scraper/async_content_scraper.py b/backend/mainService/src/scraper/async_content_scraper.py index a0042da..fead8df 100644 --- a/backend/mainService/src/scraper/async_content_scraper.py +++ b/backend/mainService/src/scraper/async_content_scraper.py @@ -30,11 +30,13 @@ from playwright.async_api import Browser, BrowserContext from src.config.log_config import setup_logging from datetime import 
diff --git a/backend/metricsService/.funcignore b/backend/metricsService/.funcignore
new file mode 100644
index 0000000..7dda614
--- /dev/null
+++ b/backend/metricsService/.funcignore
@@ -0,0 +1,8 @@
+.git*
+.vscode
+__azurite_db*__.json
+__blobstorage__
+__queuestorage__
+local.settings.json
+test
+venv
\ No newline at end of file
diff --git a/backend/metricsService/function_app/__init__.py b/backend/metricsService/function_app/__init__.py
new file mode 100644
index 0000000..96dc26c
--- /dev/null
+++ b/backend/metricsService/function_app/__init__.py
@@ -0,0 +1,11 @@
+import azure.functions as func
+import logging
+from main import app as fastapi_app  # Import the FastAPI app from main.py
+from dotenv import load_dotenv
+
+load_dotenv()
+
+async def main(req: func.HttpRequest, res: func.Out[func.HttpResponse]) -> None:
+    logging.info('Python HTTP trigger function processed a request.')
+    response = await func.AsgiMiddleware(app=fastapi_app).handle_async(req)
+    res.set(response)
\ No newline at end of file
diff --git a/backend/metricsService/function_app/function.json b/backend/metricsService/function_app/function.json
new file mode 100644
index 0000000..242db6a
--- /dev/null
+++ b/backend/metricsService/function_app/function.json
@@ -0,0 +1,17 @@
+{
+  "bindings": [
+    {
+      "authLevel": "anonymous",
+      "type": "httpTrigger",
+      "direction": "in",
+      "name": "req",
+      "methods": ["get", "post"],
+      "route": "{*route}"
+    },
+    {
+      "type": "http",
+      "direction": "out",
+      "name": "res"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/backend/metricsService/host.json b/backend/metricsService/host.json
new file mode 100644
index 0000000..e3b6a9a
--- /dev/null
+++ b/backend/metricsService/host.json
@@ -0,0 +1,21 @@
+{
+  "version": "2.0",
+  "logging": {
+    "applicationInsights": {
+      "samplingSettings": {
+        "isEnabled": true,
+        "excludedTypes": "Request"
+      }
+    }
+  },
+  "extensionBundle": {
+    "id": "Microsoft.Azure.Functions.ExtensionBundle",
+    "version": "[4.*, 5.0.0)"
+  },
+  "extensions": {
+    "http": {
+      "routePrefix": "",
+      "maxOutstandingRequests": 100
+    }
+  }
+}
\ No newline at end of file
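The `function_app` wrapper above can be smoke-tested without a running Functions host by driving `AsgiMiddleware` directly. A minimal sketch, assuming the FastAPI app in `main.py` serves a GET route at `/` (swap in a route that actually exists):

```python
import asyncio

import azure.functions as func
from main import app as fastapi_app

async def smoke_test() -> None:
    # Build a bare HttpRequest; AsgiMiddleware translates it into an ASGI
    # scope and feeds it to the FastAPI app.
    req = func.HttpRequest(method="GET", url="/", body=b"")
    resp = await func.AsgiMiddleware(app=fastapi_app).handle_async(req)
    print(resp.status_code, resp.get_body())

asyncio.run(smoke_test())
```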
{ + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + }, + "extensions": { + "http": { + "routePrefix": "", + "maxOutstandingRequests": 100 + } + } +} \ No newline at end of file diff --git a/backend/metricsService/local.settings.json b/backend/metricsService/local.settings.json new file mode 100644 index 0000000..4b4cfce --- /dev/null +++ b/backend/metricsService/local.settings.json @@ -0,0 +1,7 @@ +{ + "IsEncrypted": false, + "Values": { + "AzureWebJobsStorage": "", + "FUNCTIONS_WORKER_RUNTIME": "python" + } +} \ No newline at end of file diff --git a/backend/metricsService/requirements.txt b/backend/metricsService/requirements.txt index 3d56cd0..cb4dba7 100644 --- a/backend/metricsService/requirements.txt +++ b/backend/metricsService/requirements.txt @@ -7,4 +7,4 @@ python-dotenv==1.0.1 Requests==2.32.3 scholarly==1.7.11 uvicorn - +azure-functions