diff --git a/backend/mainService/app.py b/backend/mainService/app.py
index a40d7a9..5dab465 100644
--- a/backend/mainService/app.py
+++ b/backend/mainService/app.py
@@ -1,59 +1,34 @@
+import os
 from fastapi import FastAPI
-from contextlib import asynccontextmanager
-from src.config.playwright_driver import PlaywrightDriver as ASD
-from src.config.async_http_session import AsyncHTTPClient
+from fastapi.middleware.cors import CORSMiddleware
+from src.config.startup import startup_event
 from src.controllers.citation_controller import router as citation_router
 from src.controllers.health_controller import router as health_router
-from src.llm.Pinecone import PineconeOperations
-from src.llm.chat_llm.Groq_llm import Summarize_llm
-from src.llm.chat_llm.Azure_llm import Citation
-from src.utils.index_operation import start
-from dotenv import load_dotenv
-from src.scraper.async_content_scraper import AsyncContentScraper
-from fastapi.middleware.cors import CORSMiddleware
-import nltk
-from src.utils.concurrent_resources import cleanup_resources
-
+# Detect if running in Azure Functions (serverless); default to "false" when unset
+IS_SERVERLESS = os.getenv("SERVERLESS", "false").lower() == "true"
 
 origins = [
     "http://localhost:5173",  # Frontend running on localhost (React, Vue, etc.)
     "https://cite-me.vercel.app"
 ]
 
+# Conditionally assign lifespan
+lifespan = startup_event if not IS_SERVERLESS else None
 
-@asynccontextmanager
-async def startup_event(app: FastAPI):
-    load_dotenv()
-    nltk.download('punkt')
-    nltk.download('punkt_tab')
-
-    app.state.playwright_driver = await ASD.create()
-    app.state.pc = await PineconeOperations.create()
-    app.state.summarize_llm = Summarize_llm()
-    app.state.citation_llm = Citation()
-    # Initialize the async content scraper using its async context manager
-    async with AsyncContentScraper(playwright_driver=app.state.playwright_driver) as content_scraper:
-        app.state.async_content_scraper = content_scraper
-        start()
-        yield
-    # Exiting the async with block automatically calls __aexit__
-    await app.state.playwright_driver.quit()
-    await app.state.pc.cleanup()
-    await AsyncHTTPClient.close_session()
-    cleanup_resources()  # Clean up thread pool and other concurrent resources
-
-
-app = FastAPI(lifespan=startup_event)
+# Create FastAPI instance
+app = FastAPI(title="Citation API", version="1.0.0", lifespan=lifespan)
 
+# Middleware configuration
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # Allow specific origins. modify this to allow only the your desired origins
-    allow_credentials=True,  # Allow cookies & authentication headers
-    allow_methods=["POST", "GET", "OPTIONS", "HEAD"],  # Allow all HTTP methods (GET, POST, PUT, DELETE, etc.)
-    allow_headers=["*"],  # Allow all headers
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["POST", "GET", "OPTIONS", "HEAD"],
+    allow_headers=["*"],
 )
 
-# Include routers with prefixes
+# Include routers
 app.include_router(health_router, tags=["Health"])
 app.include_router(citation_router, prefix="/citation", tags=["Citation"])
+
diff --git a/backend/mainService/requirements-test.txt b/backend/mainService/requirements-test.txt
index 47d0b60..cd9c012 100644
--- a/backend/mainService/requirements-test.txt
+++ b/backend/mainService/requirements-test.txt
@@ -1,4 +1,4 @@
-pytest
-pytest-asyncio
-pytest-cov
-pytest-mock
\ No newline at end of file
+pytest==8.3.5
+pytest-asyncio==0.26.0
+pytest-cov==4.1.0
+pytest-mock==3.12.0
\ No newline at end of file
diff --git a/backend/mainService/src/config/log_config.py b/backend/mainService/src/config/log_config.py
index fb4a537..63baee3 100644
--- a/backend/mainService/src/config/log_config.py
+++ b/backend/mainService/src/config/log_config.py
@@ -1,3 +1,24 @@
+"""
+Logging Configuration Module
+
+This module handles the configuration of the application's logging system.
+It sets up stream and (optionally) file handlers with a standardized format for
+consistent logging throughout the application.
+
+Key Functions:
+- setup_logging: Configures and returns a logger instance
+
+Configuration:
+- Log level: INFO
+- Log format: Timestamp - Logger Name - Level - Message
+- Handlers: Stream handler (console); file handler when enabled
+
+Features:
+- Centralized logging configuration
+- Easy logger instance creation
+- Both file and stream output
+- Standardized log format
+"""
 import os
 import logging
 from datetime import datetime
@@ -8,28 +29,37 @@
 def setup_logging(
         log_level=logging.INFO,
         log_dir: str = 'logs',
-        filename: Optional[str] = 'log') -> Logger:
+        filename: Optional[str] = 'log',
+        logToFile: bool = False,
+        ) -> Logger:
+
     """
     Set up a standardized logging configuration for the entire project.
 
     Args:
         log_level (int): Logging level (default: logging.INFO)
        log_dir (str): Directory to store log files (default: 'logs')
+        filename (str): Base filename for log files (default: 'log')
+        logToFile (bool): Whether to log to file (default: False)
     """
-    # Ensure logs directory exists
-    os.makedirs(log_dir, exist_ok=True)
-
     # Create a unique log filename with timestamp
     timestamp = datetime.now().strftime("%Y%m%U")
-    log_filename = os.path.join(log_dir, f'{filename}_{timestamp}.log')
 
     # Configure logging
     logging.basicConfig(
         level=log_level,
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
         handlers=[
-            logging.FileHandler(log_filename),  # Log to file
             logging.StreamHandler()  # Also log to console
         ]
     )
-    return logging.getLogger(filename)
+    logger = logging.getLogger(filename)
+
+    if logToFile:
+        # Ensure logs directory exists
+        os.makedirs(log_dir, exist_ok=True)
+        log_filename = os.path.join(log_dir, f'{filename}_{timestamp}.log')
+        logger.addHandler(logging.FileHandler(log_filename))
+
+
+    return logger
diff --git a/backend/mainService/src/config/startup.py b/backend/mainService/src/config/startup.py
new file mode 100644
index 0000000..807310a
--- /dev/null
+++ b/backend/mainService/src/config/startup.py
@@ -0,0 +1,31 @@
+from src.llm.Pinecone import PineconeOperations
+from src.llm.chat_llm.Groq_llm import Summarize_llm
+from src.llm.chat_llm.Azure_llm import Citation
+from dotenv import load_dotenv
+from src.scraper.async_content_scraper import AsyncContentScraper
+import nltk
+from src.utils.concurrent_resources import cleanup_resources
+from contextlib import asynccontextmanager
+from src.config.playwright_driver import PlaywrightDriver as ASD
+from src.config.async_http_session import AsyncHTTPClient
+from fastapi import FastAPI
+
+@asynccontextmanager
+async def startup_event(app: FastAPI):
+    load_dotenv()
+    nltk.download('punkt')
+    nltk.download('punkt_tab')
+
+    app.state.playwright_driver = await ASD.create()
+    app.state.pc = await PineconeOperations.create()
+    app.state.summarize_llm = Summarize_llm()
+    app.state.citation_llm = Citation()
+    # Initialize the async content scraper using its async context manager
+    async with AsyncContentScraper(playwright_driver=app.state.playwright_driver) as content_scraper:
+        app.state.async_content_scraper = content_scraper
+        yield
+    # Exiting the async with block automatically calls __aexit__
+    await app.state.playwright_driver.quit()
+    await app.state.pc.cleanup()
+    await AsyncHTTPClient.close_session()
+    cleanup_resources()  # Clean up thread pool and other concurrent resources
diff --git a/backend/mainService/src/scraper/async_content_scraper.py b/backend/mainService/src/scraper/async_content_scraper.py
index 1f63f6a..a0042da 100644
--- a/backend/mainService/src/scraper/async_content_scraper.py
+++ b/backend/mainService/src/scraper/async_content_scraper.py
@@ -99,7 +99,6 @@ async def __aenter__(self):
     async def __aexit__(self, exc_type, exc_val, exc_tb):
         try:
             if self._context:
-                await self.scraper_driver.quit()
                 await self._context.close()
         except Exception as e:
             # Log the exception even if it occurred during cleanup
diff --git a/backend/mainService/src/services/citation_service.py b/backend/mainService/src/services/citation_service.py
index f9a3aa3..cbd70be 100644
--- a/backend/mainService/src/services/citation_service.py
+++ b/backend/mainService/src/services/citation_service.py
@@ -10,11 +10,9 @@
 from src.config.log_config import setup_logging
 from src.llm.chat_llm.Azure_llm import Citation
 from src.config.config import LlmConfig as LLMEC
-from src.config.config import concurrency_config, search_config
+from src.config.config import search_config, scraper_config
 from src.custom_exceptions.llm_exceptions import CitationGenerationError
 from src.llm.embedding_utils.reranker import rerank, format_for_rerank
-from src.utils.index_operation import add_index_to_memory
-from concurrent.futures import ThreadPoolExecutor
 from langchain_core.documents import Document
 from src.services.source_credibility_metric_service import get_credibility_metrics, calculate_overall_score
 from src.models.schema import Source
@@ -23,9 +21,6 @@
 log_filename = os.path.basename(__file__)
 logger = setup_logging(filename=log_filename)
 
-_index_executor = ThreadPoolExecutor(
-    max_workers=concurrency_config.HANDLE_INDEX_DELETE_WORKERS)
-
 
 class CitationService:
     """
@@ -229,7 +224,7 @@
         try:
             cleaned_result = search_results["cleaned_result"]
-            async with asyncio.timeout(15):  # 15 second timeout
+            async with asyncio.timeout((scraper_config.TIMEOUT_DURATION * 2) / 1000):  # 2x the scraper timeout, converted from ms to seconds
                 download_results = await self.scraper.get_pdfs(
                     target_urls=cleaned_result.get("links"),
                     storage_path=search_results["search_key"]
@@ -301,8 +296,6 @@
         if not index:
             logger.exception("Index creation failed")
             return False
-        # Add index to memory
-        _index_executor.submit(add_index_to_memory, index_name)
         # Populate index
         return await self._populate_index(processed_docs["batches"])
diff --git a/backend/mainService/src/utils/index_operation.py b/backend/mainService/src/utils/index_operation.py
deleted file mode 100644
index 178ed59..0000000
--- a/backend/mainService/src/utils/index_operation.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import json
-import threading
-import os
-import time
-from datetime import datetime, timezone
-from src.config.log_config import setup_logging
-
-INDEX_DICT_FILE = 'index_dict.json'
-INDEX_TEMP_FILE = 'index_dict.tmp'  # the temp file
-
-# In-memory dictionary to store index creation times.
-index_dict = {}
-
-log_filename = os.path.basename(__file__)
-logger = setup_logging(filename=log_filename)
-
-
-def add_index_to_memory(index_name: str):
-    """Add a new index creation event to the in-memory dictionary."""
-    creation_time = datetime.now(timezone.utc).strftime("%Y-%m-%d %H")
-    indexes: list = index_dict.get(creation_time, [])
-    indexes.append(index_name)
-    index_dict[creation_time] = indexes
-    logger.info(f"Added {index_name} at {creation_time}")
-
-
-def flush_index_dict_periodically(interval: int = 60):
-    """Flush the in-memory dictionary to disk every `interval` seconds."""
-    while True:
-        time.sleep(interval)
-        try:
-            with open(INDEX_TEMP_FILE, 'w') as f:
-                json.dump(index_dict, f)
-            os.replace(INDEX_TEMP_FILE, INDEX_DICT_FILE)
-            index_dict.clear()
-            logger.info(f"Flushed {len(index_dict)} indexes to disk.")
-        except Exception as e:
-            logger.exception(f"Error flushing index dict: {e}")
-
-
-def start():
-    # Start the background flushing thread (as a daemon so it doesn't block
-    # program exit)
-    flush_thread = threading.Thread(
-        target=flush_index_dict_periodically, args=(
-            180,), daemon=True)
-    flush_thread.start()
-
-
-# Example usage when creating an index:
-if __name__ == "__main__":
-    # Whenever you create a new index, just call:
-    new_index_name = "example_file.pdf"
-    add_index_to_memory(new_index_name)
-
-    # Your main process can continue doing other tasks...
-    # For demonstration, keep the script running so the background thread
-    # works.
-    while True:
-        time.sleep(10)
diff --git a/backend/metricsService/src/main.py b/backend/metricsService/main.py
similarity index 100%
rename from backend/metricsService/src/main.py
rename to backend/metricsService/main.py
diff --git a/backend/metricsService/requirements-test.txt b/backend/metricsService/requirements-test.txt
index c166a61..f17b55e 100644
--- a/backend/metricsService/requirements-test.txt
+++ b/backend/metricsService/requirements-test.txt
@@ -1,5 +1,5 @@
-pytest==7.4.3
-pytest-asyncio==0.21.1
+pytest==8.3.5
+pytest-asyncio==0.26.0
 pytest-cov==4.1.0
 httpx==0.25.2
 pytest-mock==3.12.0
\ No newline at end of file
diff --git a/backend/metricsService/src/utils/logging_config.py b/backend/metricsService/src/utils/logging_config.py
index ef9418e..a84baf4 100644
--- a/backend/metricsService/src/utils/logging_config.py
+++ b/backend/metricsService/src/utils/logging_config.py
@@ -2,7 +2,7 @@
 Logging Configuration Module
 
 This module handles the configuration of the application's logging system.
-It sets up both file and stream handlers with a standardized format for
+It sets up stream and (optionally) file handlers with a standardized format for
 consistent logging throughout the application.
 
 Key Functions:
@@ -11,7 +11,7 @@
 Configuration:
 - Log level: INFO
 - Log format: Timestamp - Logger Name - Level - Message
-- Handlers: File handler (app.log)
+- Handlers: Stream handler (console); file handler when enabled
 
 Features:
 - Centralized logging configuration
@@ -20,16 +20,61 @@
 - Standardized log format
 """
 
+import os
 import logging
+from datetime import datetime
+from typing import Optional
+from logging import Logger
 
-file_handler = logging.FileHandler('app.log')
-stream_handler = logging.StreamHandler()
+logger = None  # Global logger instance
 
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[file_handler, stream_handler]
-)
+def setup_logging(
+        log_level=logging.INFO,
+        log_dir: str = 'logs',
+        filename: Optional[str] = 'log',
+        logToFile: bool = False,
+        ) -> Logger:
+
+    """
+    Set up a standardized logging configuration for the entire project.
+
+    Args:
+        log_level (int): Logging level (default: logging.INFO)
+        log_dir (str): Directory to store log files (default: 'logs')
+        filename (str): Base filename for log files (default: 'log')
+        logToFile (bool): Whether to log to file (default: False)
+    """
+
+    global logger
+    # Create a unique log filename with timestamp
+    timestamp = datetime.now().strftime("%Y%m%U")
+
+    # Configure logging
+    logging.basicConfig(
+        level=log_level,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        handlers=[
+            logging.StreamHandler()  # Also log to console
+        ]
+    )
+    logger = logging.getLogger(filename)
+
+    if logToFile:
+        # Ensure logs directory exists
+        os.makedirs(log_dir, exist_ok=True)
+        log_filename = os.path.join(log_dir, f'{filename}_{timestamp}.log')
+        logger.addHandler(logging.FileHandler(log_filename))
+
+    return logger
+
+def get_logger(filename: str) -> Logger:
+    """
+    Get the configured logger instance.
+
+    Returns:
+        Logger: Configured logger instance
+    """
+    global logger
+    if logger is None:
+        setup_logging(filename=filename)
+    return logger
 
-def get_logger(name):
-    return logging.getLogger(name)
diff --git a/backend/metricsService/tests/api/test_endpoints.py b/backend/metricsService/tests/api/test_endpoints.py
index 0750cc1..4530bac 100644
--- a/backend/metricsService/tests/api/test_endpoints.py
+++ b/backend/metricsService/tests/api/test_endpoints.py
@@ -1,6 +1,6 @@
 import pytest
 from fastapi.testclient import TestClient
-from src.main import app
+from main import app
 from unittest.mock import AsyncMock, patch
 
diff --git a/backend/metricsService/tests/conftest.py b/backend/metricsService/tests/conftest.py
index 498fc2e..5721bbb 100644
--- a/backend/metricsService/tests/conftest.py
+++ b/backend/metricsService/tests/conftest.py
@@ -1,6 +1,6 @@
 import pytest
 from fastapi.testclient import TestClient
-from src.main import app
+from main import app
 
 @pytest.fixture
 def test_client():
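
Usage sketch (not part of the patch): a minimal illustration of how the new SERVERLESS toggle and the opt-in file logging introduced above are expected to be exercised. The module name passed to setup_logging is a hypothetical example; only the env var and function signature come from the diff.

# usage_sketch.py -- assumes the mainService package layout from this diff
import os

# app.py reads SERVERLESS at import time, so set it before importing the app;
# any value other than "true" keeps the startup_event lifespan enabled.
os.environ.setdefault("SERVERLESS", "false")

from src.config.log_config import setup_logging

# Opt in to file logging: adds logs/<filename>_<timestamp>.log
# alongside the default console (stream) handler.
logger = setup_logging(filename="example_module.py", logToFile=True)
logger.info("Console and file handlers are both active")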