From bee6a562f9fe536446265b20317a127acb463abb Mon Sep 17 00:00:00 2001
From: Ikeoluwa Oladele
Date: Fri, 28 Mar 2025 12:48:13 -0400
Subject: [PATCH 1/5] adjust logging to accept an argument indicating whether
 to also log to a file. Change the get_pdfs timeout from 15 to 20 seconds and
 remove magic numbers

---
 backend/mainService/src/config/log_config.py  | 21 ++++++++++++-------
 .../src/scraper/async_content_scraper.py      |  1 -
 .../src/services/citation_service.py          |  4 ++--
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/backend/mainService/src/config/log_config.py b/backend/mainService/src/config/log_config.py
index fb4a537..d86f126 100644
--- a/backend/mainService/src/config/log_config.py
+++ b/backend/mainService/src/config/log_config.py
@@ -8,7 +8,10 @@
 def setup_logging(
         log_level=logging.INFO,
         log_dir: str = 'logs',
-        filename: Optional[str] = 'log') -> Logger:
+        filename: Optional[str] = 'log',
+        logToFile: bool = False,
+        ) -> Logger:
+
     """
     Set up a standardized logging configuration for the entire project.

@@ -16,20 +19,24 @@ def setup_logging(
         log_level (int): Logging level (default: logging.INFO)
         log_dir (str): Directory to store log files (default: 'logs')
     """
-    # Ensure logs directory exists
-    os.makedirs(log_dir, exist_ok=True)
-
     # Create a unique log filename with timestamp
     timestamp = datetime.now().strftime("%Y%m%U")
-    log_filename = os.path.join(log_dir, f'{filename}_{timestamp}.log')

     # Configure logging
     logging.basicConfig(
         level=log_level,
         format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
         handlers=[
-            logging.FileHandler(log_filename),  # Log to file
             logging.StreamHandler()  # Also log to console
         ]
     )
-    return logging.getLogger(filename)
+    logger = logging.getLogger(filename)
+
+    if logToFile:
+        # Ensure logs directory exists
+        os.makedirs(log_dir, exist_ok=True)
+        log_filename = os.path.join(log_dir, f'{filename}_{timestamp}.log')
+        logger.addHandler(logging.FileHandler(log_filename))
+
+
+    return logger

diff --git a/backend/mainService/src/scraper/async_content_scraper.py b/backend/mainService/src/scraper/async_content_scraper.py
index 1f63f6a..a0042da 100644
--- a/backend/mainService/src/scraper/async_content_scraper.py
+++ b/backend/mainService/src/scraper/async_content_scraper.py
@@ -99,7 +99,6 @@ async def __aenter__(self):
     async def __aexit__(self, exc_type, exc_val, exc_tb):
         try:
             if self._context:
-                await self.scraper_driver.quit()
                 await self._context.close()
         except Exception as e:
             # Log the exception even if it occurred during cleanup

diff --git a/backend/mainService/src/services/citation_service.py b/backend/mainService/src/services/citation_service.py
index f9a3aa3..28f7bd1 100644
--- a/backend/mainService/src/services/citation_service.py
+++ b/backend/mainService/src/services/citation_service.py
@@ -10,7 +10,7 @@
 from src.config.log_config import setup_logging
 from src.llm.chat_llm.Azure_llm import Citation
 from src.config.config import LlmConfig as LLMEC
-from src.config.config import concurrency_config, search_config
+from src.config.config import concurrency_config, search_config, scraper_config
 from src.custom_exceptions.llm_exceptions import CitationGenerationError
 from src.llm.embedding_utils.reranker import rerank, format_for_rerank
 from src.utils.index_operation import add_index_to_memory
@@ -229,7 +229,7 @@ async def _process_documents(
         try:
             cleaned_result = search_results["cleaned_result"]

-            async with asyncio.timeout(15):  # 15 second timeout
+            async with asyncio.timeout((scraper_config.TIMEOUT_DURATION * 2) / 1000):  # twice the scraper timeout (20 seconds)
                 download_results = await self.scraper.get_pdfs(
                     target_urls=cleaned_result.get("links"),
                     storage_path=search_results["search_key"]
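Usage sketch of the new opt-in behavior (module name and messages here are illustrative, not from the codebase):

    from src.config.log_config import setup_logging

    # Default: console-only logging; no logs/ directory is created.
    logger = setup_logging(filename="citation_service.py")
    logger.info("goes to the console only")

    # Opt in to file logging: also writes logs/citation_service.py_<YYYYMMWW>.log.
    logger = setup_logging(filename="citation_service.py", logToFile=True)
    logger.info("goes to the console and the log file")
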
From 12eaed998e7ee1bd431db03995a9637a793c8083 Mon Sep 17 00:00:00 2001
From: Ikeoluwa Oladele
Date: Fri, 28 Mar 2025 12:52:32 -0400
Subject: [PATCH 2/5] update metrics service logging and adjust the docstring

---
 backend/mainService/src/config/log_config.py | 23 ++++++++
 .../src/utils/logging_config.py              | 54 +++++++++++++++----
 2 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/backend/mainService/src/config/log_config.py b/backend/mainService/src/config/log_config.py
index d86f126..63baee3 100644
--- a/backend/mainService/src/config/log_config.py
+++ b/backend/mainService/src/config/log_config.py
@@ -1,3 +1,24 @@
+"""
+Logging Configuration Module
+
+This module handles the configuration of the application's logging system.
+It sets up stream and optional file handlers with a standardized format for
+consistent logging throughout the application.
+
+Key Functions:
+- setup_logging: Returns a configured logger instance
+
+Configuration:
+- Log level: INFO
+- Log format: Timestamp - Logger Name - Level - Message
+- Handlers: Stream handler (console) and an optional file handler
+
+Features:
+- Centralized logging configuration
+- Easy logger instance creation
+- Console output with optional file output
+- Standardized log format
+"""
 import os
 import logging
 from datetime import datetime
@@ -18,6 +39,8 @@ def setup_logging(
     Args:
         log_level (int): Logging level (default: logging.INFO)
         log_dir (str): Directory to store log files (default: 'logs')
+        filename (str): Base filename for log files (default: 'log')
+        logToFile (bool): Whether to log to file (default: False)
     """
     # Create a unique log filename with timestamp
     timestamp = datetime.now().strftime("%Y%m%U")

diff --git a/backend/metricsService/src/utils/logging_config.py b/backend/metricsService/src/utils/logging_config.py
index ef9418e..dae5d9e 100644
--- a/backend/metricsService/src/utils/logging_config.py
+++ b/backend/metricsService/src/utils/logging_config.py
@@ -2,7 +2,7 @@
 Logging Configuration Module

 This module handles the configuration of the application's logging system.
-It sets up both file and stream handlers with a standardized format for
+It sets up stream and optional file handlers with a standardized format for
 consistent logging throughout the application.

 Key Functions:
@@ -11,7 +11,7 @@
 Configuration:
 - Log level: INFO
 - Log format: Timestamp - Logger Name - Level - Message
-- Handlers: File handler (app.log)
+- Handlers: Stream handler (console) and an optional file handler

 Features:
 - Centralized logging configuration
@@ -20,16 +20,48 @@
 - Standardized log format
 """

+import os
 import logging
+from datetime import datetime
+from typing import Optional
+from logging import Logger

-file_handler = logging.FileHandler('app.log')
-stream_handler = logging.StreamHandler()

-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[file_handler, stream_handler]
-)
+def setup_logging(
+        log_level=logging.INFO,
+        log_dir: str = 'logs',
+        filename: Optional[str] = 'log',
+        logToFile: bool = False,
+        ) -> Logger:
+
+    """
+    Set up a standardized logging configuration for the entire project.
+
+    Args:
+        log_level (int): Logging level (default: logging.INFO)
+        log_dir (str): Directory to store log files (default: 'logs')
+        filename (str): Base filename for log files (default: 'log')
+        logToFile (bool): Whether to log to file (default: False)
+    """
+    # Create a unique log filename with timestamp
+    timestamp = datetime.now().strftime("%Y%m%U")
+
+    # Configure logging
+    logging.basicConfig(
+        level=log_level,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        handlers=[
+            logging.StreamHandler()  # Also log to console
+        ]
+    )
+    logger = logging.getLogger(filename)
+
+    if logToFile:
+        # Ensure logs directory exists
+        os.makedirs(log_dir, exist_ok=True)
+        log_filename = os.path.join(log_dir, f'{filename}_{timestamp}.log')
+        logger.addHandler(logging.FileHandler(log_filename))
+
+
+    return logger

-def get_logger(name):
-    return logging.getLogger(name)
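One subtlety worth noting: the filename timestamp uses "%Y%m%U", whose last field is strftime's week-of-year, so log files rotate weekly rather than daily. A small illustration (dates chosen only for this example):

    from datetime import datetime

    # Year + month + week-of-year (Sunday-first, per strftime's %U):
    datetime(2025, 3, 28).strftime("%Y%m%U")  # '20250312' -> log_20250312.log
    datetime(2025, 3, 31).strftime("%Y%m%U")  # '20250313' -> a new file next week
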
+
+    Args:
+        log_level (int): Logging level (default: logging.INFO)
+        log_dir (str): Directory to store log files (default: 'logs')
+        filename (str): Base filename for log files (default: 'log')
+        logToFile (bool): Whether to log to file (default: False)
+    """
+    # Create a unique log filename with timestamp
+    timestamp = datetime.now().strftime("%Y%m%U")
+
+    # Configure logging
+    logging.basicConfig(
+        level=log_level,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+        handlers=[
+            logging.StreamHandler()  # Also log to console
+        ]
+    )
+    logger = logging.getLogger(filename)
+
+    if logToFile:
+        # Ensure logs directory exists
+        os.makedirs(log_dir, exist_ok=True)
+        log_filename = os.path.join(log_dir, f'{filename}_{timestamp}.log')
+        logger.addHandler(logging.FileHandler(log_filename))
+
+
+    return logger

-def get_logger(name):
-    return logging.getLogger(name)

From 844f78746c68e1c60f77fc11663a69017132785f Mon Sep 17 00:00:00 2001
From: Ikeoluwa Oladele
Date: Fri, 28 Mar 2025 13:10:55 -0400
Subject: [PATCH 3/5] update requirements.txt and add a get_logger helper to
 the metrics service logging config

---
 backend/mainService/requirements-test.txt          |  8 ++++----
 backend/metricsService/requirements-test.txt       |  4 ++--
 backend/metricsService/src/utils/logging_config.py | 13 +++++++++++++
 3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/backend/mainService/requirements-test.txt b/backend/mainService/requirements-test.txt
index 47d0b60..cd9c012 100644
--- a/backend/mainService/requirements-test.txt
+++ b/backend/mainService/requirements-test.txt
@@ -1,4 +1,4 @@
-pytest
-pytest-asyncio
-pytest-cov
-pytest-mock
\ No newline at end of file
+pytest==8.3.5
+pytest-asyncio==0.26.0
+pytest-cov==4.1.0
+pytest-mock==3.12.0
\ No newline at end of file

diff --git a/backend/metricsService/requirements-test.txt b/backend/metricsService/requirements-test.txt
index c166a61..f17b55e 100644
--- a/backend/metricsService/requirements-test.txt
+++ b/backend/metricsService/requirements-test.txt
@@ -1,5 +1,5 @@
-pytest==7.4.3
-pytest-asyncio==0.21.1
+pytest==8.3.5
+pytest-asyncio==0.26.0
 pytest-cov==4.1.0
 httpx==0.25.2
 pytest-mock==3.12.0
\ No newline at end of file

diff --git a/backend/metricsService/src/utils/logging_config.py b/backend/metricsService/src/utils/logging_config.py
index dae5d9e..a84baf4 100644
--- a/backend/metricsService/src/utils/logging_config.py
+++ b/backend/metricsService/src/utils/logging_config.py
@@ -26,6 +26,7 @@
 from typing import Optional
 from logging import Logger

+logger = None  # Global logger instance

 def setup_logging(
         log_level=logging.INFO,
@@ -43,6 +44,8 @@
         filename (str): Base filename for log files (default: 'log')
         logToFile (bool): Whether to log to file (default: False)
     """
+
+    global logger
     # Create a unique log filename with timestamp
     timestamp = datetime.now().strftime("%Y%m%U")

@@ -63,5 +66,15 @@
         logger.addHandler(logging.FileHandler(log_filename))


+def get_logger(filename: str) -> Logger:
+    """
+    Get the configured logger instance.
+
+    Returns:
+        Logger: Configured logger instance
+    """
+    global logger
+    if logger is None:
+        setup_logging(filename=filename)
     return logger
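A sketch of the intended call pattern for the restored helper (module names are illustrative). Note that only the first call configures logging; later calls return the same module-level logger regardless of the filename passed:

    from src.utils.logging_config import get_logger

    logger = get_logger(__name__)   # first call runs setup_logging(filename=__name__)
    logger.info("metrics service ready")

    same = get_logger("anything")   # already configured, so the argument is ignored
    assert same is logger
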
From f698b4e847ab08e8360fd58cbb304d158e7cde0b Mon Sep 17 00:00:00 2001
From: Ikeoluwa Oladele
Date: Fri, 28 Mar 2025 13:51:54 -0400
Subject: [PATCH 4/5] 1. Update app.py for easier serverless deployment by
 moving startup logic into startup.py and using an env variable to indicate
 whether the environment is serverless. 2. Remove the index operation; it was
 used to track indexes in Pinecone but is no longer needed.

---
 backend/mainService/app.py                       | 57 +++++-----------
 .../src/services/citation_service.py             |  9 +--
 .../mainService/src/utils/index_operation.py     | 60 -----------------
 backend/metricsService/src/main.py               | 67 -------------------
 .../tests/api/test_endpoints.py                  |  2 +-
 backend/metricsService/tests/conftest.py         |  2 +-
 6 files changed, 19 insertions(+), 178 deletions(-)
 delete mode 100644 backend/mainService/src/utils/index_operation.py
 delete mode 100644 backend/metricsService/src/main.py

diff --git a/backend/mainService/app.py b/backend/mainService/app.py
index a40d7a9..5dab465 100644
--- a/backend/mainService/app.py
+++ b/backend/mainService/app.py
@@ -1,59 +1,34 @@
+import os
 from fastapi import FastAPI
-from contextlib import asynccontextmanager
-from src.config.playwright_driver import PlaywrightDriver as ASD
-from src.config.async_http_session import AsyncHTTPClient
+from fastapi.middleware.cors import CORSMiddleware
+from src.config.startup import startup_event
 from src.controllers.citation_controller import router as citation_router
 from src.controllers.health_controller import router as health_router
-from src.llm.Pinecone import PineconeOperations
-from src.llm.chat_llm.Groq_llm import Summarize_llm
-from src.llm.chat_llm.Azure_llm import Citation
-from src.utils.index_operation import start
-from dotenv import load_dotenv
-from src.scraper.async_content_scraper import AsyncContentScraper
-from fastapi.middleware.cors import CORSMiddleware
-import nltk
-from src.utils.concurrent_resources import cleanup_resources
-
+# Detect if running in Azure Functions (serverless); default to "false" so an unset variable cannot crash at import
+IS_SERVERLESS = os.getenv("SERVERLESS", "false").lower() == "true"

 origins = [
     "http://localhost:5173",  # Frontend running on localhost (React, Vue, etc.)
     "https://cite-me.vercel.app"
 ]

+# Conditionally assign lifespan
+lifespan = startup_event if not IS_SERVERLESS else None

-@asynccontextmanager
-async def startup_event(app: FastAPI):
-    load_dotenv()
-    nltk.download('punkt')
-    nltk.download('punkt_tab')
-
-    app.state.playwright_driver = await ASD.create()
-    app.state.pc = await PineconeOperations.create()
-    app.state.summarize_llm = Summarize_llm()
-    app.state.citation_llm = Citation()
-    # Initialize the async content scraper using its async context manager
-    async with AsyncContentScraper(playwright_driver=app.state.playwright_driver) as content_scraper:
-        app.state.async_content_scraper = content_scraper
-    start()
-    yield
-    # Exiting the async with block automatically calls __aexit__
-    await app.state.playwright_driver.quit()
-    await app.state.pc.cleanup()
-    await AsyncHTTPClient.close_session()
-    cleanup_resources()  # Clean up thread pool and other concurrent resources
-
-
-app = FastAPI(lifespan=startup_event)
+# Create FastAPI instance
+app = FastAPI(title="Citation API", version="1.0.0", lifespan=lifespan)

+# Middleware configuration
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # Allow specific origins. modify this to allow only the your desired origins
-    allow_credentials=True,  # Allow cookies & authentication headers
-    allow_methods=["POST", "GET", "OPTIONS", "HEAD"],  # Allow all HTTP methods (GET, POST, PUT, DELETE, etc.)
-    allow_headers=["*"],  # Allow all headers
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["POST", "GET", "OPTIONS", "HEAD"],
+    allow_headers=["*"],
 )

-# Include routers with prefixes
+# Include routers
 app.include_router(health_router, tags=["Health"])
 app.include_router(citation_router, prefix="/citation", tags=["Citation"])
+

diff --git a/backend/mainService/src/services/citation_service.py b/backend/mainService/src/services/citation_service.py
index 28f7bd1..cbd70be 100644
--- a/backend/mainService/src/services/citation_service.py
+++ b/backend/mainService/src/services/citation_service.py
@@ -10,11 +10,9 @@
 from src.config.log_config import setup_logging
 from src.llm.chat_llm.Azure_llm import Citation
 from src.config.config import LlmConfig as LLMEC
-from src.config.config import concurrency_config, search_config, scraper_config
+from src.config.config import search_config, scraper_config
 from src.custom_exceptions.llm_exceptions import CitationGenerationError
 from src.llm.embedding_utils.reranker import rerank, format_for_rerank
-from src.utils.index_operation import add_index_to_memory
-from concurrent.futures import ThreadPoolExecutor
 from langchain_core.documents import Document
 from src.services.source_credibility_metric_service import get_credibility_metrics, calculate_overall_score
 from src.models.schema import Source
@@ -23,9 +21,6 @@
 log_filename = os.path.basename(__file__)
 logger = setup_logging(filename=log_filename)

-_index_executor = ThreadPoolExecutor(
-    max_workers=concurrency_config.HANDLE_INDEX_DELETE_WORKERS)
-

 class CitationService:
     """
@@ -301,8 +296,6 @@ async def _create_and_populate_index(
             if not index:
                 logger.exception("Index creation failed")
                 return False
-            # Add index to memory
-            _index_executor.submit(add_index_to_memory, index_name)

             # Populate index
             return await self._populate_index(processed_docs["batches"])

diff --git a/backend/mainService/src/utils/index_operation.py b/backend/mainService/src/utils/index_operation.py
deleted file mode 100644
index 178ed59..0000000
--- a/backend/mainService/src/utils/index_operation.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import json
-import threading
-import os
-import time
-from datetime import datetime, timezone
-from src.config.log_config import setup_logging
-
-INDEX_DICT_FILE = 'index_dict.json'
-INDEX_TEMP_FILE = 'index_dict.tmp'  # the temp file
-
-# In-memory dictionary to store index creation times.
-index_dict = {} - -log_filename = os.path.basename(__file__) -logger = setup_logging(filename=log_filename) - - -def add_index_to_memory(index_name: str): - """Add a new index creation event to the in-memory dictionary.""" - creation_time = datetime.now(timezone.utc).strftime("%Y-%m-%d %H") - indexes: list = index_dict.get(creation_time, []) - indexes.append(index_name) - index_dict[creation_time] = indexes - logger.info(f"Added {index_name} at {creation_time}") - - -def flush_index_dict_periodically(interval: int = 60): - """Flush the in-memory dictionary to disk every `interval` seconds.""" - while True: - time.sleep(interval) - try: - with open(INDEX_TEMP_FILE, 'w') as f: - json.dump(index_dict, f) - os.replace(INDEX_TEMP_FILE, INDEX_DICT_FILE) - index_dict.clear() - logger.info(f"Flushed {len(index_dict)} indexes to disk.") - except Exception as e: - logger.exception(f"Error flushing index dict: {e}") - - -def start(): - # Start the background flushing thread (as a daemon so it doesn't block - # program exit) - flush_thread = threading.Thread( - target=flush_index_dict_periodically, args=( - 180,), daemon=True) - flush_thread.start() - - -# Example usage when creating an index: -if __name__ == "__main__": - # Whenever you create a new index, just call: - new_index_name = "example_file.pdf" - add_index_to_memory(new_index_name) - - # Your main process can continue doing other tasks... - # For demonstration, keep the script running so the background thread - # works. - while True: - time.sleep(10) diff --git a/backend/metricsService/src/main.py b/backend/metricsService/src/main.py deleted file mode 100644 index 4c0bcf8..0000000 --- a/backend/metricsService/src/main.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -Citation Credibility Service API - -This is the main entry point for the Citation Credibility Service API, which provides -endpoints for analyzing and scoring the credibility of academic citations and sources. - -Key Features: -- RESTful API endpoints for credibility analysis -- CORS support for cross-origin requests -- Versioned API endpoints (/api/v1) -- Health check endpoint -- Configurable through environment variables - -Configuration: -- Environment variables are loaded from .env file -- CORS is configured to allow all origins -- Logging is configured through src.utils.logging_config - -Example Usage: - $ uvicorn src.main:app --reload - -Deployment: - The service can be deployed using any ASGI server (e.g. uvicorn, hypercorn) - and is configured to run on port 8000 by default. 
-""" - -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from src.utils.logging_config import get_logger -from src.api.endpoints import router as api_router -from dotenv import load_dotenv - -# Load environment variables -load_dotenv() - -# Get logger -logger = get_logger(__name__) - - -app = FastAPI( - title="Citation Credibility Service", - description="API for calculating credibility scores of academic sources", - version="1.0.0", -) - -# Add CORS middleware -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -# Include versioned routers -app.include_router(api_router, prefix="/api/v1") - -# Health check endpoint -@app.get("/") -async def root(): - return {"message": "Citation Credibility Service is running"} - -# Uncomment to run via uvicorn directly -# if __name__ == "__main__": -# import uvicorn -# port = int(os.environ.get("PORT", 8000)) -# uvicorn.run("main:app", host="0.0.0.0", port=port, reload=True) diff --git a/backend/metricsService/tests/api/test_endpoints.py b/backend/metricsService/tests/api/test_endpoints.py index 0750cc1..4530bac 100644 --- a/backend/metricsService/tests/api/test_endpoints.py +++ b/backend/metricsService/tests/api/test_endpoints.py @@ -1,6 +1,6 @@ import pytest from fastapi.testclient import TestClient -from src.main import app +from main import app from unittest.mock import AsyncMock, patch diff --git a/backend/metricsService/tests/conftest.py b/backend/metricsService/tests/conftest.py index 498fc2e..5721bbb 100644 --- a/backend/metricsService/tests/conftest.py +++ b/backend/metricsService/tests/conftest.py @@ -1,6 +1,6 @@ import pytest from fastapi.testclient import TestClient -from src.main import app +from main import app @pytest.fixture def test_client(): From 59b3c99cbf54329addb818e4eded479116479e9c Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Fri, 28 Mar 2025 13:53:17 -0400 Subject: [PATCH 5/5] add the new files mentioned in the last commit --- backend/mainService/src/config/startup.py | 31 +++++++++++ backend/metricsService/main.py | 67 +++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 backend/mainService/src/config/startup.py create mode 100644 backend/metricsService/main.py diff --git a/backend/mainService/src/config/startup.py b/backend/mainService/src/config/startup.py new file mode 100644 index 0000000..807310a --- /dev/null +++ b/backend/mainService/src/config/startup.py @@ -0,0 +1,31 @@ +from src.llm.Pinecone import PineconeOperations +from src.llm.chat_llm.Groq_llm import Summarize_llm +from src.llm.chat_llm.Azure_llm import Citation +from dotenv import load_dotenv +from src.scraper.async_content_scraper import AsyncContentScraper +import nltk +from src.utils.concurrent_resources import cleanup_resources +from contextlib import asynccontextmanager +from src.config.playwright_driver import PlaywrightDriver as ASD +from src.config.async_http_session import AsyncHTTPClient +from fastapi import FastAPI + +@asynccontextmanager +async def startup_event(app: FastAPI): + load_dotenv() + nltk.download('punkt') + nltk.download('punkt_tab') + + app.state.playwright_driver = await ASD.create() + app.state.pc = await PineconeOperations.create() + app.state.summarize_llm = Summarize_llm() + app.state.citation_llm = Citation() + # Initialize the async content scraper using its async context manager + async with AsyncContentScraper(playwright_driver=app.state.playwright_driver) as content_scraper: + 
app.state.async_content_scraper = content_scraper
+        yield
+    # Exiting the async with block automatically calls __aexit__
+    await app.state.playwright_driver.quit()
+    await app.state.pc.cleanup()
+    await AsyncHTTPClient.close_session()
+    cleanup_resources()  # Clean up thread pool and other concurrent resources

diff --git a/backend/metricsService/main.py b/backend/metricsService/main.py
new file mode 100644
index 0000000..4c0bcf8
--- /dev/null
+++ b/backend/metricsService/main.py
@@ -0,0 +1,67 @@
+"""
+Citation Credibility Service API
+
+This is the main entry point for the Citation Credibility Service API, which provides
+endpoints for analyzing and scoring the credibility of academic citations and sources.
+
+Key Features:
+- RESTful API endpoints for credibility analysis
+- CORS support for cross-origin requests
+- Versioned API endpoints (/api/v1)
+- Health check endpoint
+- Configurable through environment variables
+
+Configuration:
+- Environment variables are loaded from .env file
+- CORS is configured to allow all origins
+- Logging is configured through src.utils.logging_config
+
+Example Usage:
+    $ uvicorn main:app --reload
+
+Deployment:
+    The service can be deployed using any ASGI server (e.g. uvicorn, hypercorn)
+    and is configured to run on port 8000 by default.
+"""
+
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from src.utils.logging_config import get_logger
+from src.api.endpoints import router as api_router
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# Get logger
+logger = get_logger(__name__)
+
+
+app = FastAPI(
+    title="Citation Credibility Service",
+    description="API for calculating credibility scores of academic sources",
+    version="1.0.0",
+)
+
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Include versioned routers
+app.include_router(api_router, prefix="/api/v1")
+
+# Health check endpoint
+@app.get("/")
+async def root():
+    return {"message": "Citation Credibility Service is running"}
+
+# Uncomment to run via uvicorn directly
+# if __name__ == "__main__":
+#     import os, uvicorn
+#     port = int(os.environ.get("PORT", 8000))
+#     uvicorn.run("main:app", host="0.0.0.0", port=port, reload=True)
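Taken together, patches 4 and 5 turn the deployment mode into a runtime switch. A minimal sketch of how the flag is meant to be used (assumes the patched default of "false" when SERVERLESS is unset):

    import os

    # The flag is read once at import time, so set it before importing the app,
    # e.g. in the shell, a .env file, or the serverless host's app settings.
    os.environ.setdefault("SERVERLESS", "false")  # "true" on a serverless host

    from app import app  # backend/mainService/app.py: the startup_event lifespan
                         # is attached only when the flag is false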