From 8c8ae4e64bb4f002aac5116f927a646cc0df069f Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 16:14:04 -0400 Subject: [PATCH 01/11] tracking frontend only --- .github/workflows/main.yml | 74 -- .gitignore | 6 + backend/mainService/.env.example | 50 - backend/mainService/Dockerfile | 38 - backend/mainService/__init__.py | 0 backend/mainService/app.py | 34 - backend/mainService/pytest.ini | 2 - backend/mainService/requirements-test.txt | 4 - backend/mainService/requirements.txt | 30 - backend/mainService/src/__init__.py | 0 backend/mainService/src/config/__init__.py | 0 .../src/config/async_http_session.py | 23 - backend/mainService/src/config/config.py | 146 --- backend/mainService/src/config/log_config.py | 65 -- .../src/config/playwright_driver.py | 270 ----- backend/mainService/src/config/startup.py | 31 - .../mainService/src/controllers/__init__.py | 0 .../src/controllers/citation_controller.py | 79 -- .../src/controllers/health_controller.py | 14 - .../src/custom_exceptions/__init__.py | 0 .../src/custom_exceptions/api_exceptions.py | 10 - .../src/custom_exceptions/llm_exceptions.py | 13 - .../llm/Async_prepare_data_for_embedding.py | 288 ----- backend/mainService/src/llm/Instructions.py | 65 -- backend/mainService/src/llm/Pinecone.py | 486 -------- backend/mainService/src/llm/__init__.py | 0 .../mainService/src/llm/chat_llm/Azure_llm.py | 213 ---- .../src/llm/chat_llm/Gemini_llm.py | 47 - .../mainService/src/llm/chat_llm/Groq_llm.py | 80 -- .../mainService/src/llm/chat_llm/__init__.py | 0 .../src/llm/embedding_utils/reranker.py | 40 - .../src/llm/embedding_utils/vector_embed.py | 85 -- backend/mainService/src/models/schema.py | 46 - backend/mainService/src/scraper/__init__.py | 0 .../src/scraper/async_base_scraper.py | 149 --- .../src/scraper/async_content_scraper.py | 214 ---- .../src/scraper/async_searchApi.py | 152 --- .../src/scraper/site_specific/__init__.py | 0 .../site_specific/async_frontier_scraper.py | 98 -- .../site_specific/async_generic_scraper.py | 112 -- .../site_specific/async_ibm_scraper.py | 88 -- backend/mainService/src/services/__init__.py | 0 .../src/services/citation_service.py | 425 ------- .../source_credibility_metric_service.py | 126 -- .../src/utils/concurrent_resources.py | 15 - backend/mainService/src/utils/file_utils.py | 60 - .../src/utils/format_rerank_result.py | 118 -- backend/mainService/src/utils/web_utils.py | 78 -- backend/mainService/test/conftest.py | 50 - .../custom_exceptions/test_llm_exceptions.py | 16 - .../test/test_citation_controller.py | 91 -- .../mainService/test/test_citation_service.py | 1019 ----------------- .../test/test_health_controller.py | 8 - .../test_source_credibility_metric_service.py | 140 --- backend/metricsService/.env.example | 5 - backend/metricsService/Dockerfile | 30 - backend/metricsService/README.md | Bin 5238 -> 0 bytes backend/metricsService/main.py | 67 -- backend/metricsService/pytest.ini | 2 - backend/metricsService/requirements-test.txt | 5 - backend/metricsService/requirements.txt | 10 - backend/metricsService/src/__init__.py | 0 backend/metricsService/src/api/endpoints.py | 156 --- backend/metricsService/src/models/schemas.py | 71 -- .../src/services/author_reputation.py | 170 --- .../src/services/citation_data.py | 92 -- .../src/services/credibility_service.py | 132 --- .../src/services/domain_reputation.py | 152 --- .../src/services/journal_impact.py | 136 --- .../src/services/recency_score.py | 69 -- .../metricsService/src/utils/api_config.py | 37 - 
backend/metricsService/src/utils/api_utils.py | 54 - backend/metricsService/src/utils/cache.py | 118 -- .../metricsService/src/utils/cache_config.py | 50 - .../src/utils/logging_config.py | 80 -- .../tests/api/test_endpoints.py | 142 --- backend/metricsService/tests/conftest.py | 20 - .../tests/models/test_schemas.py | 85 -- .../services/test_credibility_service.py | 310 ----- .../tests/utils/test_api_utils.py | 47 - .../metricsService/tests/utils/test_cache.py | 79 -- docker-compose.yml | 36 - frontend/vercel.json | 6 + 83 files changed, 12 insertions(+), 7347 deletions(-) delete mode 100644 .github/workflows/main.yml delete mode 100644 backend/mainService/.env.example delete mode 100644 backend/mainService/Dockerfile delete mode 100644 backend/mainService/__init__.py delete mode 100644 backend/mainService/app.py delete mode 100644 backend/mainService/pytest.ini delete mode 100644 backend/mainService/requirements-test.txt delete mode 100644 backend/mainService/requirements.txt delete mode 100644 backend/mainService/src/__init__.py delete mode 100644 backend/mainService/src/config/__init__.py delete mode 100644 backend/mainService/src/config/async_http_session.py delete mode 100644 backend/mainService/src/config/config.py delete mode 100644 backend/mainService/src/config/log_config.py delete mode 100644 backend/mainService/src/config/playwright_driver.py delete mode 100644 backend/mainService/src/config/startup.py delete mode 100644 backend/mainService/src/controllers/__init__.py delete mode 100644 backend/mainService/src/controllers/citation_controller.py delete mode 100644 backend/mainService/src/controllers/health_controller.py delete mode 100644 backend/mainService/src/custom_exceptions/__init__.py delete mode 100644 backend/mainService/src/custom_exceptions/api_exceptions.py delete mode 100644 backend/mainService/src/custom_exceptions/llm_exceptions.py delete mode 100644 backend/mainService/src/llm/Async_prepare_data_for_embedding.py delete mode 100644 backend/mainService/src/llm/Instructions.py delete mode 100644 backend/mainService/src/llm/Pinecone.py delete mode 100644 backend/mainService/src/llm/__init__.py delete mode 100644 backend/mainService/src/llm/chat_llm/Azure_llm.py delete mode 100644 backend/mainService/src/llm/chat_llm/Gemini_llm.py delete mode 100644 backend/mainService/src/llm/chat_llm/Groq_llm.py delete mode 100644 backend/mainService/src/llm/chat_llm/__init__.py delete mode 100644 backend/mainService/src/llm/embedding_utils/reranker.py delete mode 100644 backend/mainService/src/llm/embedding_utils/vector_embed.py delete mode 100644 backend/mainService/src/models/schema.py delete mode 100644 backend/mainService/src/scraper/__init__.py delete mode 100644 backend/mainService/src/scraper/async_base_scraper.py delete mode 100644 backend/mainService/src/scraper/async_content_scraper.py delete mode 100644 backend/mainService/src/scraper/async_searchApi.py delete mode 100644 backend/mainService/src/scraper/site_specific/__init__.py delete mode 100644 backend/mainService/src/scraper/site_specific/async_frontier_scraper.py delete mode 100644 backend/mainService/src/scraper/site_specific/async_generic_scraper.py delete mode 100644 backend/mainService/src/scraper/site_specific/async_ibm_scraper.py delete mode 100644 backend/mainService/src/services/__init__.py delete mode 100644 backend/mainService/src/services/citation_service.py delete mode 100644 backend/mainService/src/services/source_credibility_metric_service.py delete mode 100644 
backend/mainService/src/utils/concurrent_resources.py delete mode 100644 backend/mainService/src/utils/file_utils.py delete mode 100644 backend/mainService/src/utils/format_rerank_result.py delete mode 100644 backend/mainService/src/utils/web_utils.py delete mode 100644 backend/mainService/test/conftest.py delete mode 100644 backend/mainService/test/custom_exceptions/test_llm_exceptions.py delete mode 100644 backend/mainService/test/test_citation_controller.py delete mode 100644 backend/mainService/test/test_citation_service.py delete mode 100644 backend/mainService/test/test_health_controller.py delete mode 100644 backend/mainService/test/test_source_credibility_metric_service.py delete mode 100644 backend/metricsService/.env.example delete mode 100644 backend/metricsService/Dockerfile delete mode 100644 backend/metricsService/README.md delete mode 100644 backend/metricsService/main.py delete mode 100644 backend/metricsService/pytest.ini delete mode 100644 backend/metricsService/requirements-test.txt delete mode 100644 backend/metricsService/requirements.txt delete mode 100644 backend/metricsService/src/__init__.py delete mode 100644 backend/metricsService/src/api/endpoints.py delete mode 100644 backend/metricsService/src/models/schemas.py delete mode 100644 backend/metricsService/src/services/author_reputation.py delete mode 100644 backend/metricsService/src/services/citation_data.py delete mode 100644 backend/metricsService/src/services/credibility_service.py delete mode 100644 backend/metricsService/src/services/domain_reputation.py delete mode 100644 backend/metricsService/src/services/journal_impact.py delete mode 100644 backend/metricsService/src/services/recency_score.py delete mode 100644 backend/metricsService/src/utils/api_config.py delete mode 100644 backend/metricsService/src/utils/api_utils.py delete mode 100644 backend/metricsService/src/utils/cache.py delete mode 100644 backend/metricsService/src/utils/cache_config.py delete mode 100644 backend/metricsService/src/utils/logging_config.py delete mode 100644 backend/metricsService/tests/api/test_endpoints.py delete mode 100644 backend/metricsService/tests/conftest.py delete mode 100644 backend/metricsService/tests/models/test_schemas.py delete mode 100644 backend/metricsService/tests/services/test_credibility_service.py delete mode 100644 backend/metricsService/tests/utils/test_api_utils.py delete mode 100644 backend/metricsService/tests/utils/test_cache.py delete mode 100644 docker-compose.yml create mode 100644 frontend/vercel.json diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml deleted file mode 100644 index 1038a47..0000000 --- a/.github/workflows/main.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: Python CI/CD - -env: - GPSE_API_KEY: ArzaSdB1_DhfPRKfJMdY6dp8duWdQTKQdC2xxkwc - GROQ_API_KEY: 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 - CX: 3afe68fe44e8d4620 - MIXBREAD_API_KEY: emb_f838d0f14ue7d61907d7f28fd643s8eaf49c0da2wf32f22a - PINECONE_API_KEY: pcsk_7LufHa_aUYWm5r5WwF1LBhfujiKftHWLX9iU6fyYxtkDukMnZZQKMWQJcXrFmhzt7GtVtJ - AZURE_API_KEY: 1JCm7aFbY2zVyXndOwAaljohGFAeFKjvwmDLa200gjSdlsLOqP3yJQQJ99BBACREanaXJ3w3AbgAACOG2ZyA - GOOGLE_API_KEY: Adzac4B4-q3u3Q_lssqr_dc7k-WM28ygszsVrIe - CREDIBILITY_API_URL: https://credibility-api.example.com - SERVERLESS: FALSE - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - -# Add permissions configuration -permissions: - pull-requests: write - issues: write - -jobs: - test: - runs-on: ubuntu-latest - - steps: - - uses: 
actions/checkout@v3 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.11' - - - name: Install dependencies for mainService - run: | - cd backend/mainService - python -m pip install --upgrade pip - pip install -r requirements.txt - pip install -r requirements-test.txt - - - name: Install dependencies for metricService - run: | - cd backend/metricsService - python -m pip install --upgrade pip - pip install -r requirements.txt - pip install -r requirements-test.txt - - - name: Run tests - run: | - cd backend/mainService - python -m pytest - cd ../metricsService - python -m pytest - - - name: Close failed PR - if: failure() && github.event_name == 'pull_request' - uses: actions/github-script@v6 - with: - script: | - await github.rest.pulls.update({ - owner: context.repo.owner, - repo: context.repo.repo, - pull_number: context.issue.number, - state: 'closed' - }); - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: 'This PR was automatically closed because the CI pipeline failed.' - }); \ No newline at end of file diff --git a/.gitignore b/.gitignore index 1fbafed..3b71319 100644 --- a/.gitignore +++ b/.gitignore @@ -58,5 +58,11 @@ tempCodeRunnerFile.py unit_test.py testing_workflow.py *.yaml +local.settings.json +playwright_browser +__pycache__ +backend/ +.github/ +docker-compose.yml scripts/ diff --git a/backend/mainService/.env.example b/backend/mainService/.env.example deleted file mode 100644 index fcbdac3..0000000 --- a/backend/mainService/.env.example +++ /dev/null @@ -1,50 +0,0 @@ -CX= # your google programmable custom search engine id -GPSE_API_KEY= # your google programmable #search engine api key -GROQ_API_KEY= # your groq api key -GOOGLE_API_KEY= # your gemini google api key -MIXBREAD_API_KEY= # your mixbread api key -PINECONE_API_KEY= # your pinecone api key -AZURE_MODELS_ENDPOINT = # your azure model endpoint for citation generation -CREDIBILITY_API_URL = # your credibility api url -SERVERLESS=FALSE # set to TRUE if you are using serverless mode, else set to FALSE - - -#NOTE: -# CREDIBILITY_API_URL is the url of the credibility api that is used to get the credibility metrics for the sources -# CREDIBILITY_API_URL is optional and is only used if the CREDIBILITY_API_URL environment variable is set -# If the CREDIBILITY_API_URL environment variable is not set, the credibility metrics will not be fetched - - -#AZURE_MODELS_ENDPOINT is the endpoint of the azure model that is used for citation generation -#AZURE_MODELS_ENDPOINT is required and is used to generate the citations for the sources - - -#MIXBREAD_API_KEY is the api key of the mixbread api that is used to rerank the sources -#MIXBREAD_API_KEY is required and is used to rerank the sources - - -#PINECONE_API_KEY is the api key of the pinecone api that is used to store the embeddings of the sources -#PINECONE_API_KEY is required and is used to store the embeddings of the sources - - -#GPSE_API_KEY is the api key of the google programmable search engine api that is used to search the web -#GPSE_API_KEY is required and is used to search the web - - -#GOOGLE_API_KEY is the api key for gemini google api -#GOOGLE_API_KEY it is required and is used to merge the chunk of cited citations returned by the azure model - -#CX is the custom search engine id for google programmable search engine - -#All the above can be replaced by writing your own functions for the respective services -#for instance, one could 
decide to use gemini to generate the intext citation and references rather than using an azure -#hosted model. Hence all you need to do is write your own cite function/module and replace the azure cite function in the citation service file - - - - - - - - - diff --git a/backend/mainService/Dockerfile b/backend/mainService/Dockerfile deleted file mode 100644 index 084364c..0000000 --- a/backend/mainService/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -FROM python:3.11-slim - -WORKDIR /app - -# Install system dependencies -# Installs essential tools for compiling software from source, often needed for Python package dependencies.(build-essential) -# Removes the package lists downloaded during the update to reduce the image size. -RUN apt-get update && apt-get install -y \ - build-essential \ - cron \ - && rm -rf /var/lib/apt/lists/* - -# Set the PATH environment variable to include /app -ENV PATH="/app:${PATH}" - -# Copy requirements first to leverage Docker cache -COPY requirements.txt . - -# Install Python dependencies -RUN pip install --no-cache-dir -r requirements.txt - -# Copy the source code -COPY ./scripts/ /app/scripts/ -COPY ./src/ /app/src/ -COPY ./app.py /app/app.py -COPY ./__init__.py /app/__init__.py - -# Create a directory for runtime configuration -RUN mkdir -p /app/config - -# Install playwright -RUN playwright install && playwright install-deps - -# Expose the port the app runs on -EXPOSE 8000 - -# Start both cron and the FastAPI application -CMD ["sh", "-c", "cron && uvicorn app:app --host 0.0.0.0 --port 8000"] \ No newline at end of file diff --git a/backend/mainService/__init__.py b/backend/mainService/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/backend/mainService/app.py b/backend/mainService/app.py deleted file mode 100644 index 5dab465..0000000 --- a/backend/mainService/app.py +++ /dev/null @@ -1,34 +0,0 @@ -import os -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from src.config.startup import startup_event -from src.controllers.citation_controller import router as citation_router -from src.controllers.health_controller import router as health_router - -# Detect if running in Azure Functions (serverless) -IS_SERVERLESS = os.getenv("SERVERLESS").lower() == "true" - -origins = [ - "http://localhost:5173", # Frontend running on localhost (React, Vue, etc.) 
- "https://cite-me.vercel.app" -] - -# Conditionally assign lifespan -lifespan = startup_event if not IS_SERVERLESS else None - -# Create FastAPI instance -app = FastAPI(title="Citation API", version="1.0.0", lifespan=lifespan) - -# Middleware configuration -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["POST", "GET", "OPTIONS", "HEAD"], - allow_headers=["*"], -) - -# Include routers -app.include_router(health_router, tags=["Health"]) -app.include_router(citation_router, prefix="/citation", tags=["Citation"]) - diff --git a/backend/mainService/pytest.ini b/backend/mainService/pytest.ini deleted file mode 100644 index 0102b0a..0000000 --- a/backend/mainService/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -asyncio_default_fixture_loop_scope = function diff --git a/backend/mainService/requirements-test.txt b/backend/mainService/requirements-test.txt deleted file mode 100644 index cd9c012..0000000 --- a/backend/mainService/requirements-test.txt +++ /dev/null @@ -1,4 +0,0 @@ -pytest==8.3.5 -pytest-asyncio==0.26.0 -pytest-cov==4.1.0 -pytest-mock==3.12.0 \ No newline at end of file diff --git a/backend/mainService/requirements.txt b/backend/mainService/requirements.txt deleted file mode 100644 index a5189a8..0000000 --- a/backend/mainService/requirements.txt +++ /dev/null @@ -1,30 +0,0 @@ -aiohttp==3.11.12 -azure-ai-inference==1.0.0b8 -azure-core==1.32.0 -azure-identity==1.20.0 -fastapi==0.115.11 -groq==0.20.0 -langchain==0.3.21 -langchain_community==0.3.20 -langchain_core==0.3.47 -mixedbread_ai==2.2.6 -nltk==3.9.1 -pinecone==6.0.2 -playwright==1.50.0 -Protego==0.4.0 -protobuf==6.30.1 -pydantic==2.10.6 -pytest==8.3.4 -python-dotenv==1.0.1 -reportlab==4.3.0 -Requests==2.32.3 -tenacity==9.0.0 -urllib3==2.3.0 -lxml==5.3.0 -google-genai -redis>=4.2.0 -uvicorn -httpx>=0.28.1 -pypdf -pypdf2 - diff --git a/backend/mainService/src/__init__.py b/backend/mainService/src/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/backend/mainService/src/config/__init__.py b/backend/mainService/src/config/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/backend/mainService/src/config/async_http_session.py b/backend/mainService/src/config/async_http_session.py deleted file mode 100644 index 9a4f37e..0000000 --- a/backend/mainService/src/config/async_http_session.py +++ /dev/null @@ -1,23 +0,0 @@ -from aiohttp import ClientSession - - -class AsyncHTTPClient: - """Manages a shared aiohttp session.""" - session = None - - @classmethod - async def init_session(cls): - if cls.session is None: - cls.session = ClientSession() - - @classmethod - async def close_session(cls): - if cls.session: - await cls.session.close() - cls.session = None - - @classmethod - async def getSession(cls) -> ClientSession: - if cls.session is None: - await cls.init_session() - return cls.session diff --git a/backend/mainService/src/config/config.py b/backend/mainService/src/config/config.py deleted file mode 100644 index 7ca0a07..0000000 --- a/backend/mainService/src/config/config.py +++ /dev/null @@ -1,146 +0,0 @@ -from typing import Dict, Any -from dataclasses import dataclass, field -import os - - -@dataclass -class ScraperConfig: - """Configuration class for web scraping settings. 
- - Contains various settings and parameters used for configuring web scraping behavior.""" - """Configuration for web scraping operations.""" - - # HTTP Headers - HTTP_HEADERS: Dict[str, str] = field(default_factory=lambda: { - "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", - "SEC_CH_UA": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"", - "SEC_CH_UA_MOBILE": "?0", - "SEC_FETCH_SITE": "cross-site", - "ACCEPT_ENCODING": "gzip, deflate, br, zstd", - }) - - # File and Processing Limits - # This is the maximum size of a file that we are willing to download and - # process as a source document for citation. - MB = 1024 * 1024 - - """ - This is the maximum size of a file that we are willing to download and process as a source document for citation. - """ - MAX_FILE_SIZE: int = 3 * MB # 3 MB - - """ - This is the timeout duration for the requests made to the web scraper - """ - TIMEOUT_DURATION: int = 10000 - - def __post_init__(self): - if self.MAX_FILE_SIZE <= 0: - raise ValueError("MAX_FILE_SIZE must be positive") - if self.TIMEOUT_DURATION <= 0: - raise ValueError("TIMEOUT_DURATION must be positive") - - -@dataclass -class LlmConfig: - """Configuration class for LLM and embedding settings. - - Contains settings related to LLM models, embeddings, and related parameters.""" - - """ - This is the number of tokens that the source documents are split into to generate their embeddings. - 507 is the maximum number of tokens that can be processed by the multilingual-e5-large model. - 380 seem to be a good balance between how large each chunk should be in order to reduce the number of request made to pinecone for embeddings, - """ - MAX_TOKENS: int = 380 - - """ - This is the number of tokens that the query(the user's thesis) is split into to generate its embeddings. - half of the MAX_TOKENS is a good balance between how large each chunk should be in order to reduce the number of request made to pinecone for embeddings, - as well as being as accurate as possible for the eventual intext citation. - """ - QUERY_TOKEN_SIZE: int = MAX_TOKENS // 1.5 # if using mixbread remember the max token for a query is 250 - - """ - This is the percentage of overlap between the chunks of the source documents. - 15 was an arbitrary number that seemed to work well. - Feel free to experiment with this value to see what works best for your use case. - """ - DEFAULT_OVERLAP_PERCENT: int = 10 - - """ - This is the number of tokens that are processed in a single batch. - Each request to pinecone for embeddings is limited to 90 because the inference API can only handle 96 tokens in one batch request and each token - can have a maximum size of 507. - """ - BATCH_SIZE: int = 90 - - """ - This is the maximum character lenght our pincone index name can be. - """ - INDEX_NAME_LEN: int = 42 - - """ - This is the number of documents that are processed in a single batch. 
- """ - UPSERT_BATCH_SIZE: int = 1000 - - """ - This is the llm that open router uses for generating the intext citation and reference list for each query - """ - OPEN_ROUTER_MODEL: str = "meta-llama/llama-3.3-70b-instruct:free" - - """ - This is the azure model api endpoint - """ - - -# Concurrency and Performance -@dataclass -class ConcurrencyConfig: - """Configuration class for concurrency settings.""" - - # General concurrency settings - DEFAULT_CONCURRENT_WORKERS: int = (os.cpu_count() // 2) + 1 - HANDLE_INDEX_DELETE_WORKERS: int = 2 - - # Credibility service specific settings - CREDIBILITY_MAX_THREADS: int = 4 # Maximum threads for credibility calculations - CREDIBILITY_MAX_CONCURRENT: int = 8 # Maximum concurrent operations - CREDIBILITY_BATCH_SIZE: int = 4 # Size of processing batches - - -@dataclass -class ModelConfig: - """Configuration class for AI model settings. - - Contains settings specific to AI models and their deployment.""" - """Configuration for ML models and APIs.""" - - MODEL_ID: str = "BAAI/bge-m3" - MODEL_API_URL: str = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{MODEL_ID}" - - # LLM Generation Parameters - DEFAULT_TEMPERATURE: float = 0.5 - DEFAULT_TOP_P: float = 1.0 - DEFAULT_MAX_TOKENS: int = 1024 - - -@dataclass -class SearchConfig: - """Configuration class for search settings. - - Contains settings related to search functionality and parameters.""" - """Configuration for search-related operations.""" - - DATE_RESTRICT: str = "y5" - TOP_N: int = 5 - SEARCH_URL: str = "https://www.googleapis.com/customsearch/v1?key={API_KEY}&cx={CX}&q={query}&dateRestrict={DATE_RESTRICT}&num={TOP_N}" - - -# Main configuration object -scraper_config = ScraperConfig() -concurrency_config = ConcurrencyConfig() -model_config = ModelConfig() -search_config = SearchConfig() -LlmConfig = LlmConfig() diff --git a/backend/mainService/src/config/log_config.py b/backend/mainService/src/config/log_config.py deleted file mode 100644 index 63baee3..0000000 --- a/backend/mainService/src/config/log_config.py +++ /dev/null @@ -1,65 +0,0 @@ -""" -Logging Configuration Module - -This module handles the configuration of the application's logging system. -It sets up both file(optional) and stream handlers with a standardized format for -consistent logging throughout the application. - -Key Functions: -- get_logger: Returns a configured logger instance - -Configuration: -- Log level: INFO -- Log format: Timestamp - Logger Name - Level - Message -- Handlers: File handler - -Features: -- Centralized logging configuration -- Easy logger instance creation -- Both file and stream output -- Standardized log format -""" -import os -import logging -from datetime import datetime -from typing import Optional -from logging import Logger - - -def setup_logging( - log_level=logging.INFO, - log_dir: str = 'logs', - filename: Optional[str] = 'log', - logToFile: Optional[bool] = False, - ) -> Logger: - - """ - Set up a standardized logging configuration for the entire project. 
- - Args: - log_level (int): Logging level (default: logging.INFO) - log_dir (str): Directory to store log files (default: 'logs') - filename (str): Base filename for log files (default: 'log') - logToFile (bool): Whether to log to file (default: False) - """ - # Create a unique log filename with timestamp - timestamp = datetime.now().strftime("%Y%m%U") - - # Configure logging - logging.basicConfig( - level=log_level, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - handlers=[ - logging.StreamHandler() # Also log to console - ] - ) - logger = logging.getLogger(filename) - - if logToFile: - # Ensure logs directory exists - os.makedirs(log_dir, exist_ok=True) - log_filename = os.path.join(log_dir, f'{filename}_{timestamp}.log') - logger.addHandler(logging.FileHandler(log_filename)) - - - return logger diff --git a/backend/mainService/src/config/playwright_driver.py b/backend/mainService/src/config/playwright_driver.py deleted file mode 100644 index 1eb5e28..0000000 --- a/backend/mainService/src/config/playwright_driver.py +++ /dev/null @@ -1,270 +0,0 @@ -from playwright.async_api import async_playwright, Playwright, Browser, BrowserContext, Page, Route, Request -from typing import List -import threading -import asyncio -import os -from src.config.config import scraper_config -from src.config.log_config import setup_logging - -""" -A singleton wrapper module for Playwright browser automation that provides managed browser contexts and pages. - -This module implements a thread-safe singleton pattern for managing Playwright browser instances, -contexts, and pages with built-in stealth mode capabilities. It handles browser lifecycle management -and provides methods for creating and managing browser contexts and pages with custom configurations. - -Classes: - PlaywrightDriver: A singleton class that manages Playwright browser instances and contexts. - -Features: - - Thread-safe singleton browser instance management - - Async-compatible browser initialization and operations - - Automatic stealth mode implementation for pages - - Custom header injection for all requests - - Multiple browser context support - - Managed browser lifecycle (initialization, context creation, cleanup) - -Example: - async def main(): - driver = await PlaywrightDriver.create() - try: - context = await driver.get_new_context() - page = await driver.get_new_page(context) - await page.goto("https://example.com") - finally: - await driver.quit() - -Dependencies: - - playwright.async_api - - playwright_stealth - - threading - - asyncio - -Note: - This implementation uses Chromium as the default browser with specific - arguments to disable various features that might expose automation. -""" - -log_filename = os.path.basename(__file__) -logger = setup_logging(filename=log_filename) - - -class PlaywrightDriver: - """ - A singleton class that manages Playwright browser instances and contexts. - - This class provides thread-safe browser management with stealth capabilities - and custom header injection. It ensures only one browser instance exists - across the application. 
- - Attributes: - _instance (PlaywrightDriver): Singleton instance of the class - _playwright (Playwright): Playwright instance - _browser (Browser): Browser instance - _contexts (List[BrowserContext]): List of active browser contexts - _current_context (BrowserContext): Currently active browser context - """ - - _instance = None - _playwright: Playwright = None - _browser: Browser = None - _lock = threading.Lock() - _async_lock = asyncio.Lock() - _contexts: List[BrowserContext] = [] - _current_context: BrowserContext = None - - def __new__(cls): - with cls._lock: - if not hasattr(cls, 'instance'): - cls.instance: PlaywrightDriver = super().__new__(cls) - cls.instance._browser = None - return cls.instance - - def __init__(self): - pass - - @classmethod - async def create(cls): - """ - Factory method for creating or retrieving the singleton instance. - - Returns: - PlaywrightDriver: Singleton instance of the PlaywrightDriver class - - Example: - driver = await PlaywrightDriver.create() - """ - - # Factory method for async initialization - if not cls._instance: - cls._instance = cls() - await cls._instance.__initialize_browser() - return cls._instance - - async def __initialize_browser(self) -> Browser: - """ - Initialize the browser instance with custom configurations. - - Returns: - Browser: Configured browser instance - - Raises: - Exception: If browser initialization fails - - Note: - Configures browser with specific arguments to disable automation detection - """ - - async with self._async_lock: - if self._browser: - return self._browser - - args = [ - "--disable-gpu", - "--disable-extensions", - "--disable-infobars", - "--disable-software-rasterizer", - "--disable-dns-prefetch", - '--disable-notification' - "--disable-blink-features=AutomationControlled", - ] - try: - self._playwright = await async_playwright().start() - self._browser = await self._playwright.chromium.launch(headless=True, args=args) - except Exception as e: - logger.critical(f"Error while initializing browser: {e}") - raise e - return self._browser - - async def get_new_context(self) -> BrowserContext: - """ - Create and return a new browser context. - - Returns: - BrowserContext: New browser context with downloads enabled - - Note: - Automatically adds the new context to internal context tracking - """ - - if not self._browser: - await self.__initialize_browser() - context = await self._browser.new_context(accept_downloads=True) - self._contexts.append(context) - self._current_context = context - return context - - async def get_browser(self) -> Playwright: - """ - Get the current browser instance, initializing it if necessary. - - Returns: - Playwright: Current browser instance - """ - - if not self._browser: - self._browser = await self.__initialize_browser() - return self._browser - - async def close_browser(self): - """ - Close all browser contexts and the browser instance. - - Closes all active contexts before closing the browser instance. - - Raises: - Exception: If there's an error during browser closure - """ - - try: - if self._browser: - for context in self._contexts: - await context.close() - await self._browser.close() - except Exception as e: - logger.exception(f"Error while closing browser: {e}") - - async def get_new_page(self, context: BrowserContext) -> Page: - """ - Create a new page in the specified browser context with stealth mode. 
- - Args: - context (BrowserContext): Browser context to create the page in - - Returns: - Page: New page instance with stealth mode and custom routing enabled - """ - - page = await context.new_page() - await page.route("**/*", self.handle) - return page - - async def handle(self, route: Route, request: Request): - """ - Handle browser requests by injecting custom headers. - - Args: - route (Route): Playwright route object - request (Request): Request being made - - Note: - Injects custom headers defined in scraper_config.HTTP_HEADERS - """ - - # override headers - headers = { - **request.headers, - **scraper_config.HTTP_HEADERS, - "SEC_CH_UA": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"", - } - await route.continue_(headers=headers) - - async def quit(self): - """ - Clean up all browser resources. - - Closes the browser and stops the playwright instance. - - Raises: - Exception: If there's an error during cleanup - """ - - try: - if self._browser: - await self.close_browser() - if self._playwright: - await self._playwright.stop() - except Exception as e: - logger.exception(f"Error while quitting driver: {e}") - - async def get_context(self) -> BrowserContext: - """ - Get the current context or create a new one if none exists. - - Returns: - BrowserContext: Current or new browser context - """ - - if not self._contexts: - return await self.get_new_context() - return self._current_context - - async def get_current_context(self) -> BrowserContext | None: - """ - Get the currently active browser context. - - Returns: - BrowserContext | None: Current browser context or None if no context exists - """ - - return self._current_context - - async def set_current_context(self, context: BrowserContext): - """ - Set the current active browser context. 
- - Args: - context (BrowserContext): Browser context to set as current - """ - - self._current_context = context diff --git a/backend/mainService/src/config/startup.py b/backend/mainService/src/config/startup.py deleted file mode 100644 index 807310a..0000000 --- a/backend/mainService/src/config/startup.py +++ /dev/null @@ -1,31 +0,0 @@ -from src.llm.Pinecone import PineconeOperations -from src.llm.chat_llm.Groq_llm import Summarize_llm -from src.llm.chat_llm.Azure_llm import Citation -from dotenv import load_dotenv -from src.scraper.async_content_scraper import AsyncContentScraper -import nltk -from src.utils.concurrent_resources import cleanup_resources -from contextlib import asynccontextmanager -from src.config.playwright_driver import PlaywrightDriver as ASD -from src.config.async_http_session import AsyncHTTPClient -from fastapi import FastAPI - -@asynccontextmanager -async def startup_event(app: FastAPI): - load_dotenv() - nltk.download('punkt') - nltk.download('punkt_tab') - - app.state.playwright_driver = await ASD.create() - app.state.pc = await PineconeOperations.create() - app.state.summarize_llm = Summarize_llm() - app.state.citation_llm = Citation() - # Initialize the async content scraper using its async context manager - async with AsyncContentScraper(playwright_driver=app.state.playwright_driver) as content_scraper: - app.state.async_content_scraper = content_scraper - yield - # Exiting the async with block automatically calls __aexit__ - await app.state.playwright_driver.quit() - await app.state.pc.cleanup() - await AsyncHTTPClient.close_session() - cleanup_resources() # Clean up thread pool and other concurrent resources diff --git a/backend/mainService/src/controllers/__init__.py b/backend/mainService/src/controllers/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/backend/mainService/src/controllers/citation_controller.py b/backend/mainService/src/controllers/citation_controller.py deleted file mode 100644 index e6662d6..0000000 --- a/backend/mainService/src/controllers/citation_controller.py +++ /dev/null @@ -1,79 +0,0 @@ -from fastapi import APIRouter, Form, Request, status -from fastapi.responses import JSONResponse -from typing import Dict, Any -from src.services.citation_service import CitationService -from src.config.log_config import setup_logging -import os -from src.custom_exceptions.llm_exceptions import SearchKeyGenerationError -from src.models.schema import CitationInput - - -filename = os.path.basename(__file__) -logger = setup_logging(filename=filename) - -router = APIRouter() - - -@router.post("/get_citation", status_code=status.HTTP_200_OK) -async def get_citation( - request: Request, payload: CitationInput) -> Dict[str, Any]: - """Generate citations for the provided content. 
- - Args: - request (Request): FastAPI request object containing app state - title (str): Title of the content - content (str): Text content to generate citations for - format (str): Citation format (e.g., "APA", "MLA") - - Returns: - Dict[str, Any]: Generated citations and metadata - """ - citation_service = CitationService( - PC=request.app.state.pc, - summarize_llm=request.app.state.summarize_llm, - citation_llm=request.app.state.citation_llm, - scraper=request.app.state.async_content_scraper) - try: - title = payload.title - content = payload.content - citation_style = payload.citationStyle or "APA" - - # Handle each form type accordingly - if payload.formType == "auto": - result = await citation_service.process_citation( - title, content, form_type=payload.formType, style=citation_style - ) - elif payload.formType == "web": - result = await citation_service.process_citation( - title, content, form_type=payload.formType, style=citation_style, - sources=payload.sources, supplement_urls=payload.supplementUrls - ) - elif payload.formType == "source": - result = await citation_service.process_citation( - title, content, form_type=payload.formType, style=citation_style, - sources=payload.sources - ) - else: - return JSONResponse( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - content={"error": "Invalid formType"} - ) - - if not result: - return JSONResponse( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - content={"error": "Cannot generate citation at this time"} - ) - return result - except SearchKeyGenerationError as e: - logger.error(f"Error in processing citation: {e}") - return JSONResponse( - status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - content={"error": "Title required to process citation at this time"} - ) - except Exception as e: - logger.exception(f"Error in processing citation: {e}") - return JSONResponse( - status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, - content={"error": "Internal Server Error"} - ) diff --git a/backend/mainService/src/controllers/health_controller.py b/backend/mainService/src/controllers/health_controller.py deleted file mode 100644 index b6d5f00..0000000 --- a/backend/mainService/src/controllers/health_controller.py +++ /dev/null @@ -1,14 +0,0 @@ -from fastapi import APIRouter -from typing import Dict - -router = APIRouter() - - -@router.get("/health") -async def health_check() -> Dict[str, str]: - """Check the health status of the API. 
- - Returns: - Dict[str, str]: Health status indicating the API is running - """ - return {"status": "Healthy"} diff --git a/backend/mainService/src/custom_exceptions/__init__.py b/backend/mainService/src/custom_exceptions/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/backend/mainService/src/custom_exceptions/api_exceptions.py b/backend/mainService/src/custom_exceptions/api_exceptions.py deleted file mode 100644 index f6901b6..0000000 --- a/backend/mainService/src/custom_exceptions/api_exceptions.py +++ /dev/null @@ -1,10 +0,0 @@ -class MissingApiKeyException(Exception): - pass - - -class InvalidApiKeyException(Exception): - pass - - -class MissingEndpointException(Exception): - pass diff --git a/backend/mainService/src/custom_exceptions/llm_exceptions.py b/backend/mainService/src/custom_exceptions/llm_exceptions.py deleted file mode 100644 index 912cf35..0000000 --- a/backend/mainService/src/custom_exceptions/llm_exceptions.py +++ /dev/null @@ -1,13 +0,0 @@ -class SearchKeyGenerationError(Exception): - """Custom exception class for LLM-related errors""" - pass - - -class CitationGenerationError(SearchKeyGenerationError): - """Custom exception class for LLM-related errors""" - pass - - -class SearchKeyGenerationError(SearchKeyGenerationError): - """Custom exception class for LLM-related errors""" - pass diff --git a/backend/mainService/src/llm/Async_prepare_data_for_embedding.py b/backend/mainService/src/llm/Async_prepare_data_for_embedding.py deleted file mode 100644 index 3cbe549..0000000 --- a/backend/mainService/src/llm/Async_prepare_data_for_embedding.py +++ /dev/null @@ -1,288 +0,0 @@ -from langchain_community.document_loaders import PyPDFLoader -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain.schema import Document -from typing import List, Optional, Dict -import os -import nltk -from concurrent.futures import ThreadPoolExecutor -from src.config.config import concurrency_config, LlmConfig as LLMEC -import glob -from src.config.log_config import setup_logging - -log_filename = os.path.basename(__file__) -logger = setup_logging(filename=log_filename) - -shared_executor = ThreadPoolExecutor( - max_workers=concurrency_config.DEFAULT_CONCURRENT_WORKERS) - - -def load_document(docs_path: str) -> List[Document]: - """Load PDF documents from a directory path. - - Args: - docs_path (str): Path to directory containing PDF files - - Returns: - List[Document]: List of loaded document objects - - Raises: - FileNotFoundError: If the provided path does not exist - """ - if not os.path.exists(docs_path): - raise FileNotFoundError(f"{docs_path} does not exist") - - # List PDF files in a sorted order (alphabetically, for example) - pdf_paths = sorted( - glob.glob( - os.path.join( - docs_path, - "**/*.pdf"), - recursive=True)) - all_docs = [] - - for pdf in pdf_paths: - try: - loader = PyPDFLoader(pdf) - docs = loader.load() # Each document represents one page. 
- except Exception as e: - logger.exception(f"Error loading {pdf}") - continue - - # Sort pages by metadata (if available) - docs.sort(key=lambda doc: doc.metadata.get("page_number", 0)) - - # Now filter pages: after encountering "conclusion", stop at the first - # page that mentions "reference" or "bibliography" - filtered_pages = [] - conclusion_found = False - for page in docs: - content = page.page_content.lower() - if not conclusion_found and "conclusion" in content: - conclusion_found = True - # Once we've passed conclusion, if a page signals references, stop - # processing further pages - if conclusion_found and ( - "reference" in content or "bibliography" in content): - break - filtered_pages.append(page) - - all_docs.extend(filtered_pages) - - return all_docs - - -def split_document( - documents: List[Document], - max_tokens: Optional[int] = LLMEC.MAX_TOKENS) -> List[Document]: - """Split documents into smaller chunks based on token size. - - Args: - documents (List[Document]): Documents to split - max_tokens (Optional[int], optional): Maximum tokens per chunk. - Defaults to LLMEC.MAX_TOKENS. - - Returns: - List[Document]: List of split document chunks - """ - split_docs = [] - - def process_doc(doc: Document) -> List[Document]: - chunks = chunk_text( - doc.page_content, - max_tokens=max_tokens, - overlap_percent=LLMEC.DEFAULT_OVERLAP_PERCENT - ) - return [ - Document( - page_content=chunk, - metadata=doc.metadata.copy()) for chunk in chunks] - - results = shared_executor.map(process_doc, documents) - - for result in results: - split_docs.extend(result) - - return split_docs - - -def append_metadata( - documents: list[Document], metadata: Dict[str, Dict[str, str]]) -> list[Document]: - """ - Append metadata to documents based on file path matching. - - Args: - documents (list[Document]): List of documents to update - metadata (Dict[str, Dict[str, str]]): Dictionary of metadata keyed by file path - - Returns: - list[Document]: Documents with updated metadata - """ - - metadata_lookup = { - value.get("file_path"): value for value in metadata.values()} - for document in documents: - if document.metadata.get("source") in metadata_lookup: - document.metadata = metadata_lookup[document.metadata.get( - "source")] - return documents - - -def split_and_append_metadata( - docs_path: str, metadata: Dict[str, Dict[str, str]]) -> list[Document]: - """ - Load documents, append metadata, and split them into smaller chunks. - - Args: - docs_path (str): Path to the documents - metadata (Dict[str, Dict[str, str]]): Dictionary of metadata to append - - Returns: - list[Document]: List of processed and split documents with metadata - """ - - documents = load_document(docs_path) - documents = append_metadata(documents, metadata) - documents = split_document(documents) - return documents - - -def create_batches(docs_path: str, - metadata: Dict[str, - Dict[str, - str]], - batch_element_size: int) -> List[List[Document]]: - """ - Create batches of documents with metadata for processing. 
- - Args: - docs_path (str): Path to the documents - metadata (Dict[str, Dict[str, str]]): Dictionary of metadata to append - batch_element_size (int): Size of each batch - - Returns: - List[List[Document]]: List of document batches - """ - - documents = split_and_append_metadata(docs_path, metadata) - batches = [] - for i in range(0, len(documents), batch_element_size): - batches.append(documents[i:i + batch_element_size]) - return batches - - -async def create_batches_from_doc( - documents: List[Document], batch_element_size: int) -> List[List[Document]]: - """ - Create batches of documents from a list of texts. - - args: - documents (List[Document]): List of documents to split into batches - - returns: - List[List[Document]]: List of document batches - """ - batches = [] - - split_documents = split_document(documents) - - for i in range(0, len(split_documents), batch_element_size): - batches.append(split_documents[i:i + batch_element_size]) - return batches - - -def count_tokens(text: str) -> int: - """Count the number of tokens in a text string. - - Args: - text (str): Text to count tokens for - - Returns: - int: Number of tokens in the text - """ - """Roughly estimates number of tokens based on word count. - This is a conservative estimate that tends to overestimate rather than underestimate.""" - # Most tokenizers average 1.3-1.5 tokens per word - # amazonq-ignore-next-line - return int(len(text.split()) * 1.5) - - -def process_chunk( - sentences: List[str], - max_tokens: int = LLMEC.MAX_TOKENS, - overlap_percent: int = LLMEC.DEFAULT_OVERLAP_PERCENT) -> List[str]: - """Process a list of sentences into overlapping chunks. - - Args: - sentences (List[str]): List of sentences to process - max_tokens (int, optional): Maximum tokens per chunk. - Defaults to LLMEC.MAX_TOKENS. - overlap_percent (int, optional): Percentage of overlap between chunks. - Defaults to LLMEC.DEFAULT_OVERLAP_PERCENT. - - Returns: - List[str]: List of processed text chunks - """ - """Processes a set of sentences into properly formatted text chunks.""" - chunks = [] - current_chunk = [] - current_tokens = 0 - - for sentence in sentences: - sentence_tokens = count_tokens(sentence) - # If a single sentence is too large, split it - if sentence_tokens > max_tokens: - if current_chunk: - chunks.append(" ".join(current_chunk)) - current_chunk = [] - current_tokens = 0 - - # Force split using RecursiveCharacterTextSplitter - splitter = RecursiveCharacterTextSplitter( - chunk_size=max_tokens, chunk_overlap=min( - overlap_percent, int( - max_tokens * 0.1))) - sub_chunks = splitter.split_text(sentence) - chunks.extend(sub_chunks) - continue - - # If adding this sentence exceeds chunk size, save current chunk - if current_tokens + sentence_tokens > max_tokens: - chunks.append(" ".join(current_chunk)) - - # Apply overlap - overlap_size = max( - 1, int(len(current_chunk) * (overlap_percent / 100))) - # Retain overlap context - current_chunk = current_chunk[-overlap_size:] - current_tokens = count_tokens(" ".join(current_chunk)) - - # Add sentence to chunk - current_chunk.append(sentence) - current_tokens += sentence_tokens - - # Add last chunk - if current_chunk: - chunks.append(" ".join(current_chunk)) - - return chunks - - -def chunk_text( - text: str, - max_tokens: int = LLMEC.MAX_TOKENS, - overlap_percent: int = LLMEC.DEFAULT_OVERLAP_PERCENT) -> List[str]: - """Split text into chunks with specified overlap. - - Args: - text (str): Text to split into chunks - max_tokens (int, optional): Maximum tokens per chunk. 
- Defaults to LLMEC.MAX_TOKENS. - overlap_percent (int, optional): Percentage of overlap between chunks. - Defaults to LLMEC.DEFAULT_OVERLAP_PERCENT. - - Returns: - List[str]: List of text chunks - """ - """Splits text into chunks in parallel, ensuring consistent sizes.""" - sentences = nltk.sent_tokenize(text) - return process_chunk(sentences, max_tokens, overlap_percent) diff --git a/backend/mainService/src/llm/Instructions.py b/backend/mainService/src/llm/Instructions.py deleted file mode 100644 index b7ac3b9..0000000 --- a/backend/mainService/src/llm/Instructions.py +++ /dev/null @@ -1,65 +0,0 @@ -SYSTEM_INSTRUCTION = """You are an expert in academic writing and citation formatting. - Your task is to: - 1. Insert **inline citations** where appropriate based on the provided sources. - 2. **Enclose** any directly lifted statements in quotation marks. - 3. Generate a **properly formatted Bibliography ** in {format} style using the given sources. - - Ensure that: - - Citations follow {format} formatting guidelines. - - The response is structured in **valid JSON format** with clear fields for the formatted text and Bibliography. - """ -USER_INSTRUCTION = """ - **Sources:** - {sources} - - using {format} style , provide inline citations and generate a bibliography for the texts below. - {text} -- ONLY use quotation marks for directly lifted statements and append the citation to the end of the sentence. -- Only cite texts that have a semantic match to a source. -- If no semantic match exists leave the text as it is. -- Ensure every intext citation corresponds to a reference and are accurately mapped. - -Return the response in JSON format with the fields listed below and no accompanying text: - - `"formatted_text"`: The text with inline citations. - - `"references"`: List of references used for the inline citations in {format} format. - "formatted reference in {format} format", - "formatted reference in {format} format", - ... - ] - - `"validation_notes"`: Optional. A list of notes explaining the citation decisions. -Note: If no intext citation, return an empty list for the references. - -- REFERENCE DEDUPLICATION: - - Compare reference details of the article name and author if the same, keep the most complete reference. - - When duplicates are found, merge them into a single reference entry,remove the duplicate and update the intext citation accordingly. - - """ -MERGE_CITATION_INSTRUCTION = """ - I have multiple JSON-formatted citation responses that contain different parts of an article. - Each response includes a formatted_text field with the article's content and a references field listing sources. - - Task: - Merge the formatted_text fields into a single coherent article. - Text: - {text} - - sample response: - {{ - "formatted_text": string, # The merged article. - "references": [ # list of unrepeated references formatted in {format} style. - string, - ... - ] - - validate your response by adhering to the following rules: - - The merged text is coherent and complete. - - The merged text has all references and they are correctly formatted. - - When references are adjusted, ensure the intext citation mapped to it is adjusted accordingly. - -If no intext citation, return an empty list for the references and leave text unchanged. - - - REFERENCE DEDUPLICATION: - - Compare reference details of the article name and author if the same, keep the most complete reference. 
- - When duplicates are found, merge them into a single reference entry,remove the duplicate and update the intext citation accordingly. - - }} -""" diff --git a/backend/mainService/src/llm/Pinecone.py b/backend/mainService/src/llm/Pinecone.py deleted file mode 100644 index 7b7b50b..0000000 --- a/backend/mainService/src/llm/Pinecone.py +++ /dev/null @@ -1,486 +0,0 @@ -from pinecone import ServerlessSpec, Index -from pinecone import PineconeAsyncio as Pinecone -from pinecone.data.index_asyncio import _IndexAsyncio -import os -from typing import List, Dict, Optional -from langchain.schema import Document -from pydantic import BaseModel, Field -import hashlib -from datetime import datetime - -""" -Pinecone Operations Module - -This module provides configuration and operation classes for interacting with the pinecone vector database -""" - - -class PineConeConfig(BaseModel): - """ - Configuration class for Vector Database settings. - - This class defines the configuration parameters needed for initializing and - operating with Pinecone and related language model services. - - Attributes: - pinecone_api_key (Optional[str]): API key for Pinecone service - max_pool_threads (int): Maximum number of threads for connection pool - cloud (str): Cloud provider (default: 'aws') - region (str): Cloud region (default: 'us-east-1') - default_dense_model (str): Default model for dense embeddings - default_sparse_model (str): Default model for sparse embeddings - default_dimension (int): Default embedding dimension size - """ - - pinecone_api_key: Optional[str] = Field(None, env="PINECONE_API_KEY") - max_pool_threads: int = Field(default=30, ge=1) - cloud: str = Field(default="aws") - region: str = Field(default="us-east-1") - default_dense_model: str = Field(default="multilingual-e5-large") - default_sparse_model: str = Field(default="pinecone-sparse-english-v0") - default_dimension: int = Field( - default=1024, ge=1) # Ensuring it's positive - - -class PineconeOperations: - """ - Handles operations with Pinecone vector database including indexing and querying. - - This class provides comprehensive functionality for managing Pinecone indexes, - performing vector operations, and handling hybrid search queries. - - Attributes: - _pc (Pinecone): Pinecone client instance - _spec (ServerlessSpec): Serverless specification for Pinecone - _current_index_host (str): Current active index host - _current_index (_IndexAsyncio): Current active index instance - _current_index_name (str): Name of the current active index - _default_dense_model (str): Default model for dense embeddings - _default_sparse_model (str): Default model for sparse embeddings - _default_dimension (int): Default embedding dimension - - Methods: - create: Factory method to create PineconeOperations instance - create_index: Create a new Pinecone index - upsert_documents: Insert or update documents in the index - hybrid_query: Perform hybrid search queries - rerank: Rerank search results - """ - - __from_create = False - - __slots__ = ( - '_pc', - '_spec', - '_current_index_host', - '_current_index', - '_current_index_name', - '_default_dense_model', - '_default_sparse_model', - '_default_dimension') - - def __init__(self, config: PineConeConfig, **kwargs): - """ - Private constructor. Users should not call this directly. 
- """ - - if not self.__from_create: - raise RuntimeError( - "Use PineconeOperations.create() to instantiate this class.") - - self._pc: Pinecone = None - self._spec = ServerlessSpec(cloud=config.cloud, region=config.region) - - # Configuration attributes - self._default_dense_model = config.default_dense_model - self._default_sparse_model = config.default_sparse_model - self._default_dimension = config.default_dimension - - # Mutable runtime attributes - self._current_index_host = None - self._current_index: _IndexAsyncio = None - self._current_index_name = None - - @classmethod - async def create(cls, config: Optional[PineConeConfig] = None, **kwargs): - """ - Asynchronously create a PineconeOperations instance. - - :param config: Configuration for the PineconeOperations instance. - :param kwargs: Additional keyword arguments for configuration. - :return: An instance of PineconeOperations. - """ - # Use provided config or create a default one - config = config or PineConeConfig(**kwargs) - api_key = config.pinecone_api_key or os.getenv("PINECONE_API_KEY") - if not api_key: - raise ValueError( - "PINECONE_API_KEY is required to initialize Pinecone.") - - cls.__from_create = True - instance = cls(config, **kwargs) - instance._pc = Pinecone( - api_key=api_key, - pool_threads=config.max_pool_threads) - return instance - - @property - def default_dense_model(self) -> str: - """ - Getter for default dense embedding model. - - A property decorator transforms a method into an attribute-like accessor. - It allows read-only access to the underlying attribute. - """ - return self._default_dense_model - - @property - def default_sparse_model(self) -> str: - """Getter for default sparse embedding model.""" - return self._default_sparse_model - - @property - def default_dimension(self) -> int: - """Getter for default embedding dimension.""" - return self._default_dimension - - def get_current_index(self) -> Optional[_IndexAsyncio]: - """Getter for the current active Pinecone index.""" - return self._current_index - - async def create_index( - self, - index_name: str, - dimension: Optional[int] = None, - metric: str = "dotproduct", - deletion_protection: str = "disabled" - ) -> Optional[Index]: - """ - Create a Pinecone index with specified parameters. - - :param index_name: Name of the index - :param dimension: Embedding dimension (uses default if not specified) - :param metric: Distance metric for index - :param deletion_protection: Deletion protection setting - :return: Created index or None if index already exists - """ - if not await self._pc.has_index(index_name): - index_model = await self._pc.create_index( - name=index_name, - metric=metric, - dimension=dimension or self._default_dimension, - deletion_protection=deletion_protection, - spec=self._spec - ) - await self.set_current_index(index_host=index_model.host, index_name=index_name) - if index_name != self._current_index_name: - await self.set_current_index(index_name) - return self._current_index - - async def set_current_index( - self, - index_name: str, - index_host: str = None) -> bool: - """ - Set the current active Pinecone index. 
- - :param index_name: Name of the index to set as current - """ - if not await self._pc.has_index(index_name): - return False - if not self._current_index_name == index_name and self._current_index: - await self._current_index.close() - elif self._current_index_name == index_name: - return True - - if not index_host: - index_model = await self._pc.describe_index(index_name) - self._current_index_host = index_model.host - else: - self._current_index_host = index_host - self._current_index_name = index_name - self._current_index = self._pc.IndexAsyncio( - host=self._current_index_host) - return True - - async def get_dense_embeddings( - self, - input_data: List[str], - model: Optional[str] = None, - input_type: str = "passage" - ) -> Dict: - """ - Generate dense embeddings for given inputs. - - :param input_data: List of text inputs - :param model: Embedding model (uses default if not specified) - :param input_type: Type of input for embedding - :return: Dense embeddings - """ - result = await self._pc.inference.embed( - model=model or self._default_dense_model, - inputs=input_data, - parameters={ - "input_type": input_type, - "truncate": "END" - } - ) - return result.data - - async def get_sparse_embeddings( - self, - input_data: List[str], - model: Optional[str] = None, - input_type: str = "passage" - ) -> Dict: - """ - Generate sparse embeddings for given inputs. - - :param input_data: List of text inputs - :param model: Sparse embedding model (uses default if not specified) - :param input_type: Type of input for embedding - :return: Sparse embeddings - """ - result = await self._pc.inference.embed( - model=model or self._default_sparse_model, - inputs=input_data, - parameters={ - "input_type": input_type, - "truncate": "END" - } - ) - - return result.data - - def make_id( - self, - metadata: Dict, - chunk_num: int, - batch_num: Optional[int]) -> str: - basename = str( - os.path.basename( - metadata.get( - "file_path", - ""))).replace( - " ", - "-").removesuffix(".pdf") - page_num = metadata.get("page", "") - hash_ = f"{hashlib.sha256(str(datetime.now().timestamp()).encode()).hexdigest()[-12:]}" - if batch_num: - return f"{basename}-{page_num}-{chunk_num}-{hash_}-{batch_num}" - return f"{basename}-{page_num}-{chunk_num}-{hash_}" - - # TODO: Pinecone allows upserting 1000 document in one batch request. - # currently , we are sending 90 documents in one batch request for - # embedding. - - async def upsert_documents( - self, - batches: List[List[Document]], - dense_model: Optional[str] = None, - sparse_model: Optional[str] = None, - ) -> None: - """ - Embed and upsert documents to the current index. - - :param documents: List of Langchain Documents - :param dense_model: Optional custom dense embedding model - :param sparse_model: Optional custom sparse embedding model - """ - chunk_num = 1 - if not self._current_index: - raise ValueError("No active index. 
Create or set an index first.") - upsert_vectors = [] - for batch_num, documents in enumerate(batches): - - texts = [doc.page_content for doc in documents] - dense_embeddings = await self.get_dense_embeddings(texts, model=dense_model) - sparse_embeddings = await self.get_sparse_embeddings(texts, model=sparse_model) - - for doc, dense, sparse in zip( - documents, dense_embeddings, sparse_embeddings): - doc.metadata["page_content"] = doc.page_content - id = self.make_id(doc.metadata, chunk_num, batch_num) - doc.metadata["id"] = id - upsert_vector = { - 'id': id, - 'values': dense.get('values'), - 'sparse_values': { - 'values': sparse.get('sparse_values'), - 'indices': sparse.get('sparse_indices') - }, - 'metadata': doc.metadata - } - upsert_vectors.append(upsert_vector) - chunk_num += 1 - - if batch_num == 9 or batch_num == len(batches) - 1: - await self._current_index.upsert(vectors=upsert_vectors, async_req=True) - upsert_vectors = [] - chunk_num = 1 - - def hybrid_score_norm(self, dense, sparse, alpha: float): - """Hybrid score using a convex combination - - alpha * dense + (1 - alpha) * sparse - - Args: - dense: Array of floats representing - sparse: a dict of `indices` and `values` - alpha: scale between 0 and 1 - """ - if alpha < 0 or alpha > 1: - raise ValueError("Alpha must be between 0 and 1") - if sparse: - hs = { - 'indices': sparse['indices'], - 'values': [v * (1 - alpha) for v in sparse['values']] - } - else: - raise ValueError("Sparse vector cannot be None or empty") - return [v * alpha for v in dense], hs - - async def sparse_query( - self, - query: str | Dict, - top_k: int = 10, - include_metadata: bool = True - ) -> Dict: - """Perform a sparse vector query.""" - if isinstance(query, str): - sparse = await self.get_sparse_embeddings([query], input_type="query")[0] - sparse_vector = { - "values": sparse.get("sparse_values"), - "indices": sparse.get("sparse_indices") - } - query = { - "sparse_vector": sparse_vector, - "top_k": top_k, - "include_metadata": include_metadata - } - - return await self._current_index.query(**query) - - async def dense_query( - self, - query: str | Dict, - top_k: int = 10, - include_metadata: bool = True - ) -> Dict: - """Perform a dense vector query.""" - if isinstance(query, str): - dense_vector = self.get_dense_embeddings( - [query], input_type="query")[0].get("values") - query = { - "vector": dense_vector, - "top_k": top_k, - "include_metadata": include_metadata - } - - return await self._current_index.query(**query) - - async def hybrid_query( - self, - query: str | Dict, - top_k: int = 10, - include_metadata: bool = True, - ) -> Dict: - """ - Query the current index with flexible input options. - - :param query: Query string or pre-prepared query dictionary - :param top_k: Number of results to return - :param include_metadata: Whether to include document metadata - :return: Query results - """ - if not self._current_index: - raise ValueError("No active index. 
Create or set an index first.") - - if isinstance(query, str): - dense_vector = await self.get_dense_embeddings([query], input_type='query') - dense_vector = dense_vector[0].get("values") - sparse = await self.get_sparse_embeddings([query], input_type='query') - sparse = sparse[0] - sparse_vector = { - "values": sparse.get("sparse_values"), - "indices": sparse.get("sparse_indices") - } - - normalized_dense_vector, normalized_sparse_vector = self.hybrid_score_norm( - dense_vector, sparse_vector, alpha=0.5) - query = { - "vector": normalized_dense_vector, - "sparse_vector": normalized_sparse_vector, - "top_k": top_k, - "include_metadata": include_metadata - } - elif isinstance(query, dict): - query["top_k"] = query.get("top_k", top_k) - query["include_metadata"] = query.get( - "include_metadata", include_metadata) - else: - raise ValueError("""Query must be a string or a dictionary in - { - "vector": hdense, - "sparse_vector": hsparse, - "top_k": top_k, - "include_metadata": include_metadata - } format""") - - return await self._current_index.query(**query) - - async def delete_index(self, index_name: str) -> None: - """ - Delete a Pinecone index. - - :param index_name: Name of the index to delete - """ - await self._pc.delete_index(index_name, timeout=-1) - - async def rerank( - self, - query: str, - matches: List[Dict], - model: str = "bge-reranker-v2-m3", - top_n: int = 3) -> List[Dict]: - """ - Rerank the top-k matches using the retrieved text. - - :param query: User's search query - :param matches: List of retrieved documents (from Pinecone search) - :param top_k: Number of results to return after reranking - :return: Reranked list of documents - """ - rerank_doc = [] - for match in matches: - rerank_doc.append({ - 'id': match.get('id'), - "page_content": match.get('metadata').get('page_content'), - "metadata": match.get('metadata') - }) - result = await self._pc.inference.rerank( - model=model, - query=query, - documents=rerank_doc, - top_n=top_n, - rank_fields=["page_content"], - return_documents=True, - parameters={"truncate": "END"}) - - return result - - async def get_idx_stat(self) -> Dict: - """Get index statistics.""" - stat = await self._current_index.describe_index_stats() - return stat.total_vector_count - - async def cleanup(self) -> None: - """Cleanup resources.""" - if self._current_index: - await self._current_index.close() - if self._pc: - await self._pc.close() - - def has_index(self, index_name) -> bool: - """Check if an index is currently active.""" - return self._pc.has_index(index_name) diff --git a/backend/mainService/src/llm/__init__.py b/backend/mainService/src/llm/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/backend/mainService/src/llm/chat_llm/Azure_llm.py b/backend/mainService/src/llm/chat_llm/Azure_llm.py deleted file mode 100644 index 1e8bb84..0000000 --- a/backend/mainService/src/llm/chat_llm/Azure_llm.py +++ /dev/null @@ -1,213 +0,0 @@ -import json -import os -import re -from azure.ai.inference import ChatCompletionsClient -from azure.ai.inference.models import SystemMessage, UserMessage -from azure.core.credentials import AzureKeyCredential -from typing import List, Optional, Dict, Union, Any -from src.llm.Instructions import * -from src.llm.chat_llm.Gemini_llm import Genai_cite -import asyncio -from src.config.log_config import setup_logging -from src.custom_exceptions.api_exceptions import MissingApiKeyException, InvalidApiKeyException, MissingEndpointException -from azure.core.exceptions import HttpResponseError -from 
azure.ai.inference.models import ChatCompletions -from src.custom_exceptions.llm_exceptions import CitationGenerationError -import logging -from concurrent.futures import ThreadPoolExecutor -from src.config.config import concurrency_config - -logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel( - logging.WARNING) - - -filename = os.path.basename(__file__) -logger = setup_logging(filename=filename) -RESPONSE_CLEANUP_PATTERN = re.compile(r'^(```json\n|```|json|\n)|(```|\n)$') - - -_executor = ThreadPoolExecutor( - max_workers=concurrency_config.DEFAULT_CONCURRENT_WORKERS) - - -class Citation: - model = "Phi-4" - embedding_model = "text-embedding-3-small" - - def __init__( - self, - source: List[str] = None, - api_key: str = None, - model: Optional[str] = None, - endpoint: Optional[str] = None) -> None: - """Initialize the Azure hosted LLM client. - - Args: - source (str, optional): Source identifier. Defaults to None. - api_key (str, optional): Azure API key. Defaults to global api_key. - model (Optional[str], optional): Name of the model to use. Defaults to None. - endpoint (Optional[str], optional): Azure endpoint URL. Defaults to None. - """ - self.api_key = api_key or get_azure_api_key("AZURE_CREDENTIAL") - self.model_name = model or self.model - self.source = source - self.client = ChatCompletionsClient( - endpoint=endpoint or get_azure_endpoint("AZURE_MODELS_ENDPOINT"), - credential=AzureKeyCredential(self.api_key), - ) - self.merger = Genai_cite() - - async def cite(self, - text: List[str], - citation_style: str) -> Dict[str, - Union[str, - List[Dict[str, - str]]]]: - """Generate citations for given text passages. - - Args: - text (List[str]): List of text passages to generate citations for - format (str): Citation format (e.g., "APA", "MLA") - - Returns: - Dict[str, Union[str, List[Dict[str, str]]]]: Dictionary containing citations and metadata - """ - # amazonq-ignore-next-line - batch_size = max(1, len(text) // 10) - try: - tasks = [self._cite(text[i:i + batch_size], citation_style) - for i in range(0, len(text), batch_size)] - citations = await asyncio.gather(*tasks) - merged_citations = await self.merger.merge_citation(citations, format=citation_style) - except Exception as e: - logger.exception(f"Error in citation generation: {e}") - raise CitationGenerationError( - f"""Citation generation failed for { - len(text)} text passages: {e}""") from e - - return merged_citations - - async def _cite(self, text: str | - List[str], format: str) -> Dict[str, Any]: - """Internal method to process citation requests. - - Args: - text (Union[str, List[str]]): Text or list of texts to generate citations for - format (str): Citation format (e.g., "APA", "MLA") - - Returns: - Dict[str, Any]: Processed citation results - """ - messages = [ - SystemMessage( - content=SYSTEM_INSTRUCTION.format( - format=format)), UserMessage( - content=USER_INSTRUCTION.format( - text=text, sources=self.source, format=format)), ] - model = self.model_name - # Offload blocking work to a thread - logger.info(f"Sending request to Azure API with messages") - result = await asyncio.get_running_loop().run_in_executor( - _executor, - self._blocking_citation_request, - messages, - model - ) - - return result - - def _blocking_citation_request( - self, messages: List[str], model_name: str = None) -> Dict[str, Any]: - """Make a blocking citation request to the Azure API. - - Args: - messages (List[str]): List of messages to process - model_name (str, optional): Model to use for citation. 
Defaults to None. - - Returns: - Dict[str, Any]: Raw API response containing citation data - """ - try: - response: ChatCompletions = self.client.complete(messages=messages, model=( - model_name or self.model_name), temperature=0.1, top_p=0.1) - response_content = response.choices[0].message.content - # amazonq-ignore-next-line - response_content = response_content.strip() - response_content = re.sub( - RESPONSE_CLEANUP_PATTERN, '', response_content) - result = json.loads(response_content) - except HttpResponseError as e: - logger.exception(f"Error in establishing azure client: {e}") - raise e - except json.JSONDecodeError as e: - logger.warning(f"Error in decoding json: {e}") - return {"unformatted_response": response} - return result - - -def validate_azure_api_key(api_key: str) -> bool: - """ - Validate Azure API key format. - - Args: - api_key (str): The API key to validate - - Returns: - bool: True if valid, False otherwise - - """ - # Basic pattern for Azure API keys - adjust as needed - pattern = r'^[a-zA-Z0-9]{32,}$' - return bool(re.match(pattern, api_key)) - - -def get_azure_api_key(key: str) -> str: - """ - Retrieve and validate Azure API key from environment variables. - - Returns: - str: Valid API key - - Raises: - MissingApiKeyException: If API key is not set in environment - InvalidApiKeyException: If API key format is invalid - """ - api_key = os.getenv(key, "") - - if not api_key: - raise MissingApiKeyException( - "AZURE_PHI_CREDENTIAL key missing from environment variables" - ) - - if not validate_azure_api_key(api_key): - raise InvalidApiKeyException( - "AZURE_PHI_CREDENTIAL has invalid format. Please check your API key" - ) - return api_key - - -def get_azure_endpoint(endpoint: str) -> str: - """ - Retrieve and validate Azure endpoint from environment variables. - - Returns: - str: Valid endpoint - - Raises: - MissingEndpointException: If endpoint is not set in environment - InvalidEndpointException: If endpoint format - """ - - endpoint = os.getenv(endpoint, "") - if not endpoint: - raise MissingEndpointException( - "AZURE_ENDPOINT key missing from environment variables" - ) - return endpoint - - -def __del__(self): - """ - Cleanup resources when the object is destroyed. 
- """ - _executor.shutdown(wait=True) diff --git a/backend/mainService/src/llm/chat_llm/Gemini_llm.py b/backend/mainService/src/llm/chat_llm/Gemini_llm.py deleted file mode 100644 index 9fabc7a..0000000 --- a/backend/mainService/src/llm/chat_llm/Gemini_llm.py +++ /dev/null @@ -1,47 +0,0 @@ -import os -from typing import List, Dict -from src.config.log_config import setup_logging -from google import genai -from google.genai import types -from src.llm.Instructions import MERGE_CITATION_INSTRUCTION -import json - -log_filename = os.path.basename(__file__) -logger = setup_logging(filename=log_filename) - - -class Genai_cite: - model = "gemini-2.0-flash" - - def __init__(self, api_key: str = os.getenv("GOOGLE_API_KEY"), - llm_model: str = f'models/{model}'): - self.api_key = api_key - self.client = genai.Client(api_key=self.api_key) - self.llm_model = llm_model - - async def merge_citation( - self, text: List[Dict[str, str]], format: str) -> Dict | bool: - try: - response = await self.client.aio.models.generate_content( - model=self.llm_model, - config=types.GenerateContentConfig(response_mime_type="application/json"), - contents=MERGE_CITATION_INSTRUCTION.format(text=text, format=format) - ) - - logger.info(f"usage: {response.usage_metadata}") - - # Parse the response text to JSON - try: - # Remove any potential markdown code block indicators - clean_text = response.text.strip('`').replace( - 'json\n', '').replace('\n', '') - result = json.loads(clean_text) - return result - except json.JSONDecodeError as je: - logger.error(f"Failed to parse LLM response as JSON: {je}") - logger.debug(f"Raw response text: {response.text}") - raise - - except Exception as e: - logger.exception(f"Error in merging citation: {e}") - raise e diff --git a/backend/mainService/src/llm/chat_llm/Groq_llm.py b/backend/mainService/src/llm/chat_llm/Groq_llm.py deleted file mode 100644 index a2a6370..0000000 --- a/backend/mainService/src/llm/chat_llm/Groq_llm.py +++ /dev/null @@ -1,80 +0,0 @@ -from groq import Groq -import os -import re -import json -from src.config.log_config import setup_logging -from typing import Optional -from json.decoder import JSONDecodeError -from src.custom_exceptions.llm_exceptions import SearchKeyGenerationError - -filename = os.path.basename(__file__) -logger = setup_logging(filename=filename) - - -class Summarize_llm: - - def __init__(self, api_key: str = os.getenv("GROQ_API_KEY"), - llm_model: str = "llama-3.3-70b-versatile"): - self.api_key = api_key - self.client = Groq(api_key=self.api_key) - self.llm_model = llm_model - - def getKeywordSearchTerm(self, document: str, proposed_title: Optional[str] = None) -> str: - """ - Generate a search term from the provided document using LLM. 
- - Args: - document: Input text to generate search term from - - Returns: - str: Generated search term or error message - - Raises: - LLMError: If there's an error in LLM processing - """ - try: - # Input validation - if not document or not isinstance(document, str): - logger.warning("Invalid or empty document provided") - return "No content to summarize" - - # Trim document if too long - max_length = 4000 # Adjust based on model's context window - if len(document) > max_length: - logger.warning(f"Document truncated from {len(document)} to {max_length} characters") - document = document[:max_length] - - # Make API call with error handling - - if proposed_title: - document = f"Here is the proposed title: {proposed_title}\n\nHere is the content: {document}" - else: - document = f"Here is the content: {document}" - - completion = self.client.chat.completions.create( - model=self.llm_model, - messages=[ - { - "role": "user", - "content": f"summarize the provided into a google search term and return a json response as 'search_term : value', if no content provided, your response should be 'message:no content to summarize'.{document}" - }, - ], - temperature=0.9, - top_p=1, - max_tokens=1024, - stream=False, - stop=None, - response_format={"type": "json_object"} - ) - result = completion.choices[0].message.content - return json.loads(result).get( - "search_term") or json.loads(result).get("message") - except JSONDecodeError: - logger.error("Failed to decode JSON response") - result = re.sub( - r'^(```json\n|```|json|\n|\{)|(```|\n|\})$', '', result) - return result - - except Exception as e: - logger.error(f"Unexpected error in getKeywordSearchTerm: {str(e)}") - raise SearchKeyGenerationError(f"Unexpected error: {str(e)}") diff --git a/backend/mainService/src/llm/chat_llm/__init__.py b/backend/mainService/src/llm/chat_llm/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/backend/mainService/src/llm/embedding_utils/reranker.py b/backend/mainService/src/llm/embedding_utils/reranker.py deleted file mode 100644 index 0579182..0000000 --- a/backend/mainService/src/llm/embedding_utils/reranker.py +++ /dev/null @@ -1,40 +0,0 @@ -from mixedbread_ai.client import AsyncMixedbreadAI, RerankingResponse -import os -from typing import Dict, List -from src.custom_exceptions.api_exceptions import MissingApiKeyException -from dotenv import load_dotenv - -load_dotenv() - -model = "mixedbread-ai/mxbai-rerank-large-v1" -api_key = os.getenv("MIXBREAD_API_KEY", "") -if not api_key: - raise MissingApiKeyException( - "MIXBREAD_API_KEY is required to initialize MixedbreadAI.") - - -async def rerank(query: str, - matches: List[Dict[str, - str]], - rank_fields: List = [], - top_n: int = 3) -> list[Dict]: - mxbai = AsyncMixedbreadAI(api_key=api_key) - - reranked_docs: RerankingResponse = await mxbai.reranking( - model=model, - query=query, - input=matches, - return_input=True, - top_k=top_n, - rank_fields=rank_fields, - ) - - # reranked_result = [r.input for r in reranked_docs.data] - return reranked_docs - - -def format_for_rerank(matches: List[Dict[str, str]]) -> list[Dict]: - return [{ - "page_content": metadata.get('page_content'), - "metadata": metadata - } for match in matches if (metadata := match.get('metadata'))] diff --git a/backend/mainService/src/llm/embedding_utils/vector_embed.py b/backend/mainService/src/llm/embedding_utils/vector_embed.py deleted file mode 100644 index 26a29b6..0000000 --- a/backend/mainService/src/llm/embedding_utils/vector_embed.py +++ /dev/null @@ -1,85 +0,0 @@ 
-import os -import re -from azure.ai.inference import EmbeddingsClient -from azure.core.credentials import AzureKeyCredential -from typing import List, Optional -from src.config.log_config import setup_logging -from src.custom_exceptions.api_exceptions import MissingApiKeyException, InvalidApiKeyException -from azure.core.exceptions import HttpResponseError -from azure.ai.inference.models import EmbeddingsResult -import logging - - -"""Currently using the pinecone embedding model. This will be replacing the Pinecone embedding model(Maybe)""" - -logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel( - logging.WARNING) - -filename = os.path.basename(__file__) -logger = setup_logging(filename=filename) - - -def vector_embed(text: List[str], model: Optional[str] = None) -> List[float]: - """Embed a text using the Azure LLM.""" - - embedding_model = "text-embedding-3-small" - api_key = get_azure_api_key("AZURE_CREDENTIAL") - - client = EmbeddingsClient( - endpoint=os.getenv("AZURE_MODELS_ENDPOINT", ""), - credential=AzureKeyCredential(api_key), - ) - try: - response: EmbeddingsResult = client.embed( - input=text, - model=model or embedding_model, - dimensions=1536 - ) - doc_embeds = [r.embedding for r in response.data] - return doc_embeds - - except HttpResponseError as e: - logger.exception(f"Error in establishing azure client: {e}") - raise e - - -def validate_azure_api_key(api_key: str) -> bool: - """ - Validate Azure API key format. - - Args: - api_key (str): The API key to validate - - Returns: - bool: True if valid, False otherwise - - Note: This is a basic validation - adjust pattern based on your Azure key format - """ - # Basic pattern for Azure API keys - adjust as needed - pattern = r'^[a-zA-Z0-9]{32,}$' - return bool(re.match(pattern, api_key)) - - -def get_azure_api_key(key: str) -> str: - """ - Retrieve and validate Azure API key from environment variables. - - Returns: - str: Valid API key - - Raises: - MissingApiKeyException: If API key is not set in environment - InvalidApiKeyException: If API key format is invalid - """ - api_key = os.getenv(key, "") - - if not api_key: - raise MissingApiKeyException( - "AZURE_PHI_CREDENTIAL key missing from environment variables" - ) - - if not validate_azure_api_key(api_key): - raise InvalidApiKeyException( - "AZURE_PHI_CREDENTIAL has invalid format. 
Please check your API key" - ) - return api_key diff --git a/backend/mainService/src/models/schema.py b/backend/mainService/src/models/schema.py deleted file mode 100644 index cac7d8f..0000000 --- a/backend/mainService/src/models/schema.py +++ /dev/null @@ -1,46 +0,0 @@ -from typing import List, Optional, Union, Literal -from pydantic import BaseModel, Field -from datetime import datetime, timezone as tz - - -class Source(BaseModel): - url: Optional[str] = None - content: Optional[str] = None - title: str - authors: str - type: str = 'website' - publishedDate: Optional[str] = None - doi: Optional[str] = None - volume: Optional[str] = None - accessDate: Optional[str] = Field( - default=datetime.now( - tz.utc).strftime("%Y-%m-%d"), - alias="access_date") - - -class AutoCitationInput(BaseModel): - title: str - content: str = Field(default="") - formType: Literal["auto"] - citationStyle: Optional[str] = "APA" - - -class WebCitationInput(BaseModel): - title: str - content: str = Field(default="") - formType: Literal["web"] - citationStyle: Optional[str] = "APA" - supplementUrls: bool = False - sources: List[Source] - - -class DirectSourceCitationInput(BaseModel): - title: str - content: str - formType: Literal["source"] - citationStyle: Optional[str] = "APA" - sources: List[Source] - - -CitationInput = Union[AutoCitationInput, - WebCitationInput, DirectSourceCitationInput] diff --git a/backend/mainService/src/scraper/__init__.py b/backend/mainService/src/scraper/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/backend/mainService/src/scraper/async_base_scraper.py b/backend/mainService/src/scraper/async_base_scraper.py deleted file mode 100644 index 457064f..0000000 --- a/backend/mainService/src/scraper/async_base_scraper.py +++ /dev/null @@ -1,149 +0,0 @@ -""" -Base Scraper Module - -This module provides the foundational class for web scraping operations using Playwright. -It implements core functionality for browser interactions and file downloads while -allowing specific implementations to be defined in child classes. - -The BasePlaywrightScraper class provides: -- Browser context management -- File download handling -- Error handling and logging -- Abstract methods for child class implementation - -Classes: - BasePlaywrightScraper: Abstract base class for Playwright-based web scrapers -""" - -from playwright.async_api import BrowserContext -from typing import Optional -import os -from abc import ABC, abstractmethod -import asyncio -from src.config.playwright_driver import PlaywrightDriver -from urllib3.util import parse_url -from src.config.log_config import setup_logging -from src.config.config import scraper_config - -filename = os.path.basename(__file__) -logger = setup_logging(filename=filename) - - -class BasePlaywrightScraper(ABC): - """ - Abstract base class for implementing Playwright-based web scraping operations. - - This class provides core functionality for handling browser interactions and file downloads - using Playwright's async API. It implements common scraping patterns and utilities while - leaving specific implementation details to child classes. 
- - Attributes: - context (BrowserContext): Playwright browser context for managing browser sessions - PD (PlaywrightDriver): Custom Playwright driver instance for browser automation - - Methods: - _handle_download: Internal method to handle file downloads - download_pdf: Abstract method to be implemented by child classes for PDF downloads - """ - - def __init__( - self, - context: BrowserContext, - playwright_driver: PlaywrightDriver): - """ - Initialize the base scraper with browser context and playwright driver. - - Args: - context (BrowserContext): Playwright browser context for managing browser sessions - playwright_driver (PlaywrightDriver): Instance of custom Playwright driver - """ - - self.context = context - self.PD = playwright_driver - - async def _handle_download( - self, - storage_dir: str, - url: str = None, - timeout: int = None) -> str | bool: - """ - Handle file download with Playwright's built-in download capabilities. - - This method manages the download process including page creation, download triggering, - and file saving with proper error handling and timeout management. - - Args: - storage_dir (str): Directory path where the downloaded file will be saved - url (str, optional): URL to download the file from - timeout (int, optional): Custom timeout duration for the download operation - - Returns: - str: Path to the downloaded file if successful - bool: False if download fails - - Raises: - asyncio.TimeoutError: If download exceeds timeout duration - Exception: For any other errors during download process - """ - - try: - logger.info(f"Starting download from: {url}") - page = await self.PD.get_new_page(self.context) - - async with page.expect_download(timeout=scraper_config.TIMEOUT_DURATION) as download_info: - await page.evaluate(f"window.open('{url}')") - - logger.info("Download triggered, waiting for file...") - # Prevent indefinite hang - download = await asyncio.wait_for(download_info.value, timeout=timeout or scraper_config.TIMEOUT_DURATION) - - suggested_filename = download.suggested_filename or parse_url( - url).path.split('/')[-1] - - download_path = os.path.join(storage_dir, suggested_filename) - logger.info(f"Saving file to: {download_path}") - - await download.save_as(download_path) - logger.info("Download completed successfully.") - - if os.path.exists(download_path) and os.path.getsize( - download_path) > 0: - return download_path - logger.error("Error occured during file saving") - return False - - except asyncio.TimeoutError: - logger.exception("Download timed out") - return False - - except Exception as e: - logger.exception(f"Error during download: {e}", exc_info=True) - return False - - finally: - if 'page' in locals(): - await page.close() - - @abstractmethod - async def download_pdf( - self, - url: str, - download_path: str) -> Optional[str]: - """ - Abstract method to download PDF from the given URL. - - This method must be implemented by child classes to provide specific PDF - download functionality. 
- - Args: - url (str): URL of the PDF to download - download_path (str): Path where the PDF should be saved - - Returns: - Optional[str]: Path to the downloaded PDF if successful, None otherwise - - Raises: - NotImplementedError: If the child class doesn't implement this method - """ - - pass diff --git a/backend/mainService/src/scraper/async_content_scraper.py b/backend/mainService/src/scraper/async_content_scraper.py deleted file mode 100644 index a0042da..0000000 --- a/backend/mainService/src/scraper/async_content_scraper.py +++ /dev/null @@ -1,214 +0,0 @@ -""" -Content Scraper Module - -This module provides the main implementation for scraping and processing content -from various sources. It handles both PDF downloads and content extraction, -providing a unified interface for different types of content. - -The AsyncContentScraper class provides: -- PDF download management -- Content extraction and processing -- Site-specific scraper integration -- Error handling and logging - -Classes: - AsyncContentScraper: Main class for handling content scraping operations -""" - -from urllib3.util import parse_url -from typing import Dict, Optional, Union -import os -from src.scraper.site_specific.async_ibm_scraper import IBMScraper -from src.scraper.site_specific.async_frontier_scraper import FrontierScraper -from src.scraper.site_specific.async_generic_scraper import GenericScraper -from src.scraper.async_base_scraper import BasePlaywrightScraper -from src.utils.web_utils import WebUtils -from src.utils.file_utils import FileUtils -from src.config.playwright_driver import PlaywrightDriver -import asyncio -from datetime import datetime -from playwright.async_api import Browser, BrowserContext -from src.config.log_config import setup_logging -from datetime import timezone as tz - - -log_filename = os.path.basename(__file__) -logger = setup_logging(filename=log_filename) - -""" -Citation Content Scraper Module - -This module provides asynchronous functionality for scraping citation content from various sources. -It handles concurrent requests and content extraction efficiently using asyncio. - -Classes: - AsyncContentScraper: Main class for handling asynchronous content scraping operations -""" - - -class AsyncContentScraper: - """ - A class to handle asynchronous content scraping operations. - - This class provides methods to concurrently fetch and process content from multiple URLs, - optimizing performance through asynchronous operations. - - Attributes: - session (aiohttp.ClientSession): Async session for making HTTP requests - timeout (int): Maximum time in seconds to wait for a response - max_retries (int): Maximum number of retry attempts for failed requests - - Methods: - get_content(url: str) -> str: - Asynchronously retrieves content from a given URL - - process_urls(urls: List[str]) -> List[Dict]: - Processes multiple URLs concurrently and returns their content - """ - - def __init__(self, playwright_driver: PlaywrightDriver = None): - """ - Initialize the AsyncContentScraper with an optional playwright driver. - - This constructor sets up the initial state of the scraper, including browser and context - attributes that will be initialized when the scraper is used as a context manager. - - Args: - playwright_driver (PlaywrightDriver, optional): Instance of PlaywrightDriver for browser automation. - If not provided, a new instance will be created when entering the context manager. 
- - Attributes: - scraper_driver (PlaywrightDriver): The playwright driver instance - _browser (Browser): Playwright browser instance, initialized in context manager - _context (BrowserContext): Playwright browser context, initialized in context manager - current_download_path (str): Path where downloads are currently being stored - """ - - self.scraper_driver: PlaywrightDriver = playwright_driver - self._browser: Browser = None - self._context: BrowserContext = None - self.current_download_path: str = None - - async def __aenter__(self): - self.scraper_driver = self.scraper_driver or await PlaywrightDriver.create() - self._browser = await self.scraper_driver.get_browser() - self._context = await self.scraper_driver.get_context() - await self._setup_scrapers() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - try: - if self._context: - await self._context.close() - except Exception as e: - # Log the exception even if it occurred during cleanup - logger.critical(f"Exception while closing context: {e}",exc_info=True) - - if exc_type: - logger.error("Exception in context manager", exc_info=(exc_type, exc_val, exc_tb)) - - async def _setup_scrapers(self): - self.scrapers: Dict[BasePlaywrightScraper] = { - "research.ibm.com": IBMScraper( - self._context, self.scraper_driver), "www.frontiersin.org": FrontierScraper( - self._context, self.scraper_driver), "default": GenericScraper( - self._context, self.scraper_driver)} - - async def get_pdf(self, - target_url: str, - storage_path: Optional[str] = None) -> tuple[str, - Optional[str], - str] | bool: - """Download a PDF from the specified URL. - - Args: - target_url (str): The URL of the PDF to download - storage_path (Optional[str], optional): Path where to store the PDF. - If not provided, a default path will be generated. Defaults to None. - - Returns: - tuple[str, Optional[str], str]: A tuple containing: - - The original target URL - - The path where the PDF was saved (None if failed) - - The storage directory path - """ - try: - parsed_url = parse_url(target_url) - base_url = f"{parsed_url.scheme}://{parsed_url.host}" - - # Set up download path - if not storage_path: - default_path = parsed_url.host + \ - str(datetime.now(tz.utc).strftime("%d_%m_%Y_%H_%M_%S")) - storage_path = os.path.join( - os.getcwd(), "downloads", default_path) - else: - storage_path = os.path.abspath(storage_path) - - self.current_download_path = storage_path - - # Check robots.txt - can_fetch, _ = WebUtils.check_robots_txt( - base_url, target_url, "Mozilla/5.0") - if not can_fetch: - logger.warning(f"can't fetch {target_url}") - return False - - # Ensure download directory exists - if not FileUtils.ensure_directory(storage_path): - logger.critical("Failed to create download directory") - raise OSError("Failed to create download directory") - - # Get appropriate scraper and download - scraper: BasePlaywrightScraper = self.scrapers.get( - parsed_url.host, self.scrapers["default"]) - file_path = await scraper.download_pdf(target_url, storage_path) - return target_url, file_path, storage_path - - except Exception as e: - logger.exception(f"Error in get_pdf: {e}") - return False - - async def get_pdfs(self, - target_urls: list[str], - storage_path: Optional[str] = None) -> Dict[str,Union[int,Dict[str, str],Optional[str]]]: - """Download multiple PDFs concurrently from the provided URLs. - - Args: - target_urls (list[str]): List of URLs to download PDFs from - storage_path (Optional[str], optional): Base path for storing the PDFs. 
- If not provided, a default path will be generated. Defaults to None. - - Returns: - Dict[str, Union[int, Dict[str, str], Optional[str]]]: A dictionary containing: - - 'count': Number of successfully downloaded PDFs - - 'paths': Mapping of URLs to their local file paths - - 'storage_path': The base storage directory path used - """ - results = {"count": 0, "paths": {}, "storage_path": None} - - storage_path = storage_path + \ - str(datetime.now(tz.utc).strftime("%d_%m_%Y_%H_%M_%S")) if storage_path else None - - # Create tasks for all downloads - tasks = [self.get_pdf(url, storage_path) for url in target_urls] - - # Execute downloads concurrently - async_result = await asyncio.gather(*tasks, return_exceptions=True) - - # Process results - for result in async_result: - if isinstance(result, bool): - continue - url, path, storage_path, = result - if path: - results["count"] += 1 - results["paths"][url] = path - results["storage_path"] = storage_path - else: - logger.exception(f"Failed to get pdf from {url}") - - if results["count"] == 0: - logger.warning("No PDFs were successfully downloaded.") - - return results diff --git a/backend/mainService/src/scraper/async_searchApi.py b/backend/mainService/src/scraper/async_searchApi.py deleted file mode 100644 index f4a0264..0000000 --- a/backend/mainService/src/scraper/async_searchApi.py +++ /dev/null @@ -1,152 +0,0 @@ -""" -Search API Module - -This module provides functionality for interacting with search APIs to retrieve -and process search results. It includes: - -- Search result retrieval -- Data cleaning and normalization -- Metadata extraction -- Error handling and logging - -The SearchApi class handles all search-related operations, including: -- Initializing HTTP sessions -- Executing search queries -- Cleaning and formatting results -- Managing API rate limits and errors - -Classes: - SearchApi: Main class for handling search API operations -""" - -import os -from urllib.parse import quote_plus -from src.config.async_http_session import AsyncHTTPClient -from src.config.config import search_config -from datetime import datetime, timezone as tz -from src.config.log_config import setup_logging -from typing import Optional - - -filename = os.path.basename(__file__) -logger = setup_logging(filename=filename) - - -class SearchApi: - session = None # Shared session - - @classmethod - async def init_session(cls): - """Initialize the shared aiohttp session""" - if cls.session is None: - cls.session = await AsyncHTTPClient.getSession() - - @classmethod - async def search(cls, query: str, top_n: Optional[int] = None) -> dict: - """Fetch search results asynchronously using a shared session.""" - if not cls.session: - await cls.init_session() - - url = search_config.SEARCH_URL.format( - API_KEY=os.getenv("GPSE_API_KEY"), - CX=os.getenv("CX"), - query=quote_plus(query), - TOP_N=top_n or search_config.TOP_N, - DATE_RESTRICT=search_config.DATE_RESTRICT - ) - - try: - async with cls.session.get(url) as response: - response.raise_for_status() - data = await response.json() - except Exception as e: - logger.critical(f"Error occurred while fetching search results: {str(e)}") - raise e - - # with open("sample_output\\search_results.json", "w") as f: - # json.dump(data, f, indent=4) - - return data - - @classmethod - async def clean_data(cls, data: dict) -> dict: - """Extracts relevant metadata from search results.""" - cleaned_data = {} - links = [] - - for item in data.get("items", []): - pagemap = item.get("pagemap", {}) - metatags = 
pagemap.get("metatags", [{}])[0] - link = metatags.get("citation_pdf_url") or metatags.get( - "htmlFormattedUrl",) or metatags.get("og:url", "") - - if link: - cleaned_data[link] = cls.clean(metatags) - links.append(link) - - result = {"meta": cleaned_data, "links": links} - - return result - - @classmethod - async def clean_search( - cls, - query: str, - top_n: Optional[int] = None) -> dict: - """Performs search and cleans the data asynchronously.""" - data = await cls.search(query, top_n=top_n) - return await cls.clean_data(data) - - @classmethod - def clean(cls, metatags: dict) -> dict: - """Cleans metadata from search results.""" - field_mappings = { - 'title': [ - 'citation_title', - 'dc.title', - 'og:title'], - 'link': [ - 'citation_pdf_url', - 'htmlFormattedUrl', - 'og:url'], - 'type': [ - 'type', - 'og:type'], - 'publisher': [ - 'dc.publisher', - 'citation_publisher'], - 'journal_title': [ - 'citation_journal_title', - 'citation_conference_title', - 'citation_book_title'], - 'publication_date': [ - 'prism.publicationdate', - 'Updated Date', - 'citation_publication_date'], - 'citation_doi': ['citation_doi'], - 'author_name': [ - 'dc.creator', - 'citation_author'], - 'volume': ['citation_volume'], - 'issn': [ - 'citation_issn', - 'prism.issn'], - 'abstract': [ - 'citation_abstract', - 'dc.description'], - } - # Create result dictionary using dictionary comprehension - result = { - field: next((metatags.get(key) for key in keys if metatags.get(key)), '') - for field, keys in field_mappings.items() - } - - # Add access date separately since it doesn't depend on metatags - result['access_date'] = datetime.now( - tz.utc).strftime("%Y-%m-%d %H:%M:%S") - - # Set default type if none found - if not result['type']: - result['type'] = 'website' - - return result diff --git a/backend/mainService/src/scraper/site_specific/__init__.py b/backend/mainService/src/scraper/site_specific/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py b/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py deleted file mode 100644 index 06a88a9..0000000 --- a/backend/mainService/src/scraper/site_specific/async_frontier_scraper.py +++ /dev/null @@ -1,98 +0,0 @@ -from typing import Union -from src.scraper.async_base_scraper import BasePlaywrightScraper -from urllib.parse import urljoin -from src.utils.web_utils import WebUtils -from src.utils.file_utils import FileUtils -import os -from src.config.log_config import setup_logging -from typing import Optional -from playwright.async_api import Page -from playwright.async_api import TimeoutError -from src.config.config import scraper_config - -filename = os.path.basename(__file__) -logger = setup_logging(filename=filename) - - -class FrontierScraper(BasePlaywrightScraper): - element_timeout = scraper_config.TIMEOUT_DURATION - - async def download_pdf( - self, url: str, download_path: str) -> Union[str, bool]: - try: - logger.info(f"Attempting to download PDF from Frontier: {url}") - download_link = await self._get_download_link(url) - if not download_link: - return False - - if not await self._check_file_size(download_link): - return False - - return await self._handle_download(url=download_link, storage_dir=download_path) - except Exception as e: - logger.error(f"Error in Frontier PDF download: {e}", exc_info=True) - return False - - async def _get_download_link(self, url: str) -> Optional[str]: - page = None - try: - page = await self.context.new_page() - if 
not url.endswith("pdf"): - await page.goto(url, wait_until='networkidle', timeout=self.element_timeout) - await self._interact_with_dropdown(page) - download_link = await self._extract_download_link(page) - else: - download_link = url - - return download_link - finally: - if page: - await page.close() - - async def _interact_with_dropdown(self, page: Page): - dropdown = page.locator("css=#FloatingButtonsEl > button") - if await dropdown.count() > 0: - try: - await dropdown.click(timeout=self.element_timeout) - logger.info("Successfully clicked dropdown") - except TimeoutError as timeout_e: - logger.warning( - f"Timeout while interacting with dropdown: {timeout_e}") - except Exception as e: - logger.warning( - f"Unexpected error while interacting with dropdown: {e}") - - async def _extract_download_link(self, page: Page) -> str: - download_element = page.get_by_role('link', name="Download PDF") - - if await download_element.count() == 0: - raise ValueError("Element for href extraction not found") - download_link = await download_element.get_attribute('href') - if not download_link: - raise ValueError("Download link attribute is empty") - logger.info("Found download link successfully") - return urljoin(page.url, download_link) - - async def _check_file_size(self, download_link: str) -> bool: - """ - Verify if the file size is within acceptable limits. - - Args: - download_link (str): URL of the file to check - - Returns: - bool: True if file size is acceptable, False otherwise - """ - try: - file_size = WebUtils.get_file_size(download_link) - except Exception as e: - logger.warning(f"Could not check file size: {e}") - return False - - if file_size > FileUtils.MAX_FILE_SIZE: - logger.warning( - f"""File size {file_size} bytes exceeds maximum limit of { - FileUtils.MAX_FILE_SIZE} bytes""") - return False - - return True diff --git a/backend/mainService/src/scraper/site_specific/async_generic_scraper.py b/backend/mainService/src/scraper/site_specific/async_generic_scraper.py deleted file mode 100644 index 426a490..0000000 --- a/backend/mainService/src/scraper/site_specific/async_generic_scraper.py +++ /dev/null @@ -1,112 +0,0 @@ -""" -Generic Web Scraper Module - -This module provides a generic web scraper implementation that can handle PDF downloads -from various websites. It includes functionality for: -- PDF detection and download -- Web content extraction -- PDF generation from web content -- Error handling and logging - -The GenericScraper class serves as a fallback when site-specific scrapers are not available. -It uses Playwright for browser automation and ReportLab for PDF generation. 
- -Classes: - GenericScraper: Implements generic PDF download and generation functionality -""" - -from src.scraper.async_base_scraper import BasePlaywrightScraper -from src.utils.web_utils import WebUtils -from src.utils.file_utils import FileUtils -import os -from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer -from reportlab.lib.styles import getSampleStyleSheet -from reportlab.lib.pagesizes import letter -from reportlab.lib.units import inch -from urllib3.util import parse_url -from src.config.log_config import setup_logging -import asyncio -from src.config.config import scraper_config - -filename = os.path.basename(__file__) -logger = setup_logging(filename=filename) - - -class GenericScraper(BasePlaywrightScraper): - async def download_pdf(self, url: str, download_path: str) -> str | bool: - try: - if not url.lower().endswith('.pdf'): - logger.info("URL does not point to a PDF file") - logger.info(f"Attempting to make a PDF in {download_path}") - return await self.make_pdf(url, download_path) - - if WebUtils.get_file_size(url) > FileUtils.MAX_FILE_SIZE: - logger.warning("File size exceeds limit") - return False - - return await self._handle_download(url=url, storage_dir=download_path) - except Exception as e: - logger.exception(f"Error downloading PDF from generic source: {e}") - return False - - def generate_pdf_sync(self, full_path: str, content: str) -> str: - """ - Generate a PDF from the given content using ReportLab's Platypus. - - :param full_path: The full file path where the PDF will be saved. - :param content: The text content to include in the PDF. - :return: The path to the generated PDF. - """ - # Create a document template with letter page size. - doc = SimpleDocTemplate(full_path, pagesize=letter) - styles = getSampleStyleSheet() - story = [] - - # Create a Paragraph which handles text wrapping. - paragraph = Paragraph(content, styles["Normal"]) - story.append(paragraph) - - # Optionally, add some spacing. - story.append(Spacer(1, 0.2 * inch)) - - # Build the PDF. - doc.build(story) - return full_path - - # Asynchronous method: Generate a PDF from a URL using Playwright for - # content extraction. - async def make_pdf(self, url: str, download_path: str) -> str | bool: - """ - Generate a PDF from a webpage's content. - - :param url: The URL of the webpage. - :param download_path: Directory where the PDF will be saved. - :return: The full path to the saved PDF, or False if an error occurred. - """ - try: - page = await self.PD.get_new_page(self.context) - logger.info("New page created for PDF generation.") - - # Navigate to the URL and wait for DOM content to be loaded (faster - # than waiting for full network idle). - await page.goto(url, wait_until="domcontentloaded", timeout=scraper_config.TIMEOUT_DURATION) - content = await page.locator("body").inner_text() - - await page.close() - - # Parse the URL to create a sensible filename. - parsed = parse_url(url) - base = parsed.path.split('/')[0] or parsed.host - filename = f"{base}.pdf" - full_path = os.path.join(download_path, filename) - - # Offload the synchronous PDF generation to avoid blocking the - # event loop. 
- loop = asyncio.get_running_loop() - pdf_path = await loop.run_in_executor(None, self.generate_pdf_sync, full_path, content) - logger.info(f"PDF saved to {pdf_path}") - return pdf_path - - except Exception : - logger.exception("Error making PDF") - return False diff --git a/backend/mainService/src/scraper/site_specific/async_ibm_scraper.py b/backend/mainService/src/scraper/site_specific/async_ibm_scraper.py deleted file mode 100644 index 33bf219..0000000 --- a/backend/mainService/src/scraper/site_specific/async_ibm_scraper.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -IBM Research Scraper Module - -This module provides functionality for scraping and downloading PDFs from IBM Research publications. -It extends the BasePlaywrightScraper class to implement site-specific scraping logic for IBM's research portal. - -The scraper handles: -- PDF download link detection -- File size validation -- Error handling and logging -- Playwright-based browser automation - -Classes: - IBMScraper: Implements IBM-specific PDF download functionality -""" - -from src.scraper.async_base_scraper import BasePlaywrightScraper -from src.utils.web_utils import WebUtils -from src.utils.file_utils import FileUtils -from src.config.log_config import setup_logging -from src.config.config import scraper_config -import os - - -filename = os.path.basename(__file__) -logger = setup_logging(filename=filename) - - -class IBMScraper(BasePlaywrightScraper): - element_timeout = scraper_config.TIMEOUT_DURATION - - async def download_pdf(self, url: str, download_path: str) -> str | bool: - page = None - try: - logger.info(f"Attempting to download PDF from IBM: {url}") - - page = await self.context.new_page() - await page.goto(url, wait_until='networkidle') - - # Try multiple strategies to find the download link - download_element = None - try: - # First try specific CSS selector - download_element = page.locator( - '#main-content > article > div > div.aVLxf > header > div > a') - - if not await download_element.count(): - # Try by text content - download_element = page.get_by_role( - 'link', name="Download paper") - - if await download_element.count(): - download_link = await download_element.get_attribute('href') - if not download_link: - raise ValueError("Download link attribute is empty") - - logger.info(f"Found download link: {download_link}") - - # Check file size if needed - try: - # Implement file size check using Playwright - size = WebUtils.get_file_size(download_link) - if size > FileUtils.MAX_FILE_SIZE: - logger.warning(f"File size {size} exceeds maximum limit") - return False - except Exception as e: - logger.warning(f"Could not check file size: {e}") - - # Close page before starting download - await page.close() - page = None - - # Handle the download - return await self._handle_download(url=download_link, storage_dir=download_path) - else: - raise ValueError("No download element found") - - except Exception: - logger.exception("Failed to get download link") - return False - - except Exception: - logger.exception("Error in IBM PDF download") - return False - - finally: - if page: - await page.close() diff --git a/backend/mainService/src/services/__init__.py b/backend/mainService/src/services/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/backend/mainService/src/services/citation_service.py b/backend/mainService/src/services/citation_service.py deleted file mode 100644 index cbd70be..0000000 --- a/backend/mainService/src/services/citation_service.py +++ /dev/null @@ -1,425 +0,0 @@ -from typing 
import Dict, List, Any, Optional -from src.scraper.async_searchApi import SearchApi -from src.llm.chat_llm.Groq_llm import Summarize_llm -from src.scraper.async_content_scraper import AsyncContentScraper as ACS -from src.llm.Async_prepare_data_for_embedding import create_batches, chunk_text, create_batches_from_doc -import asyncio -import os -from src.llm.Pinecone import PineconeOperations -from src.utils.format_rerank_result import filter_mixbread_results -from src.config.log_config import setup_logging -from src.llm.chat_llm.Azure_llm import Citation -from src.config.config import LlmConfig as LLMEC -from src.config.config import search_config,scraper_config -from src.custom_exceptions.llm_exceptions import CitationGenerationError -from src.llm.embedding_utils.reranker import rerank, format_for_rerank -from langchain_core.documents import Document -from src.services.source_credibility_metric_service import get_credibility_metrics, calculate_overall_score -from src.models.schema import Source -from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type - -log_filename = os.path.basename(__file__) -logger = setup_logging(filename=log_filename) - - -class CitationService: - """ - A service class that handles the end-to-end process of generating citations for given content. - - This class orchestrates the citation generation process including document search, content scraping, - document processing, vector indexing, and citation formatting. It utilizes multiple external services - including search APIs, content scrapers, and language models. - - Attributes: - PC (PineconeOperations): Instance for vector database operations - scraper_driver (PlaywrightDriver): Browser automation driver for content scraping - summarize_llm (Summarize_llm): Language model for content summarization - citation_llm (Citation): Language model for citation generation - - Methods: - process_single_query: Process a single search query - process_queries: Handle multiple queries concurrently - process_citation: Main method for generating citations - """ - - def __init__( - self, - PC: PineconeOperations, - scraper: ACS, - summarize_llm: Summarize_llm, - citation_llm: Citation): - self.PC = PC - self.summarize_llm = summarize_llm - self.citation_llm = citation_llm - self.scraper = scraper - - async def process_single_query(self, query: str) -> Dict[str, Any]: - """ - Process a single query and return the results. - We query the pinecone index with the query and get the top 5 results. - We then rerank the results using the llm and return the top result. - - Args: - query (str): The query to process - - Returns: - Dict[str, Any]: The processed results - - """ - logger.info(f"Processing query {query[:15]}") - search_results = await self.PC.hybrid_query(query=query, top_k=5) - formatted_results = format_for_rerank(search_results['matches']) - reranked_results = await rerank(matches=formatted_results, query=query, top_n=1, rank_fields=["page_content"]) - return reranked_results - - @retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=4, max=10), - retry=retry_if_exception_type(Exception) - ) - async def process_queries( - self, queries: List[str]) -> List[Dict[str, Any]]: - """ - Process multiple queries concurrently. 
- - Args: - queries (List[str]): List of queries to process - - Returns: - List[Dict[str, Any]]: List of processed results - - """ - logger.info("Processing queries concurrently") - try: - # Create tasks for all queries - tasks = [self.process_single_query(query) for query in queries] - - # Process all queries concurrently - results = await asyncio.gather(*tasks) - return results - - except Exception as e: - logger.exception(f"Error in batch query processing: {str(e)}") - raise CitationGenerationError( - "Failed to process queries after multiple retries") from e - - async def process_citation(self, - title: str, - content: str, - form_type: str, - style: str = "APA", - sources: List[Dict] = None, - supplement_urls: bool = False) -> Dict[str, - Any] | False: - """ - Main orchestrator for citation generation process. - - Args: - title (str): Title of the content - content (str): Content to generate citations for - style (str, optional): Citation style. Defaults to "APA". - - Returns: - Dict[str, Any] | False: Citation results or False if error occurs - - """ - try: - # Step 0: Generate index name - title = self.summarize_llm.getKeywordSearchTerm(content, proposed_title=title) - index_name = self._generate_index_name(title) - logger.info(f"index_name = {index_name}") - if await self.PC.set_current_index(index_name): - logger.info(f"Index set to {index_name}") - return await self._generate_citations(content, style) - - # Step 1: Get sources - processed_docs = None - if form_type == "auto": - search_results = await self._get_search_results(title) - processed_docs = await self._process_documents(search_results) - elif form_type == "web": - processed_docs = await self.process_web_sources(title=title, sources=sources, supplement_urls=supplement_urls) - elif form_type == "source": - processed_docs = await self.process_direct_sources(sources) - - # Step 2: Create and populate index - index_success = await self._create_and_populate_index(processed_docs, index_name=index_name) - if not index_success: - return False - - # Step 3: Generate citations - return await self._generate_citations(content, style) - - except Exception as e: - logger.exception(f"Error in citation process: {str(e)}") - return False - - async def process_web_sources(self, - sources: List[Source], - supplement_urls: bool, - title: str) -> Dict[str, - Any]: - """Handle web form sources with optional supplementary URLs""" - max_sources = search_config.TOP_N - if supplement_urls: - remaining_slots = max_sources - len(sources) - logger.info(f"fetching {remaining_slots} additional sources") - additional_results = await self._get_search_results(search_key=title, top_n=remaining_slots) - sources_dict = additional_results.copy() - for item in sources: - key = item.url - sources_dict["cleaned_result"]["meta"][key] = item.model_dump() - sources_dict["cleaned_result"]["links"].append(key) - - return await self._process_documents(sources_dict) - - async def process_direct_sources( - self, sources: List[Source]) -> Dict[str, Any]: - """Handle direct source content without searching""" - - sources_as_docs = [ - Document( - page_content=source.content, - metadata={key: value for key, value in source.model_dump().items() if key != "content" and value is not None} - ) - for source in sources - ] - - # Await batch creation for efficient processing - batches = await create_batches_from_doc(sources_as_docs, LLMEC.BATCH_SIZE) - - return {"batches": batches} - - async def _get_search_results( - self, search_key: str, top_n: Optional[int] = None) -> 
Dict[str, Any] | False: - """ - Get search terms and perform initial search. - Here we use the google search_api to find sources that are relevant to the content. - - Args: - search_key (str): The keyword term used by the Google Search Api to find relevant source. - Returns: - Dict[str, Any] | False: Search results dictionary or False if error occurs - """ - - try: - - cleaned_result = await SearchApi.clean_search(search_key, top_n=top_n) - return {"search_key": search_key, "cleaned_result": cleaned_result} - except Exception as e: - logger.exception(f"Error getting search results: {str(e)}") - return False - - async def _process_documents( - self, search_results: Dict[str, Any]) -> Dict[str, Any] | False: - """ - Process and download documents from search results. - - Args: - search_results (Dict[str, Any]): Results from the search operation - - Returns: - Dict[str, Any] | False: Processed document batches or False if error occurs - """ - - try: - cleaned_result = search_results["cleaned_result"] - async with asyncio.timeout((scraper_config.TIMEOUT_DURATION*2)/1000): # 20 second timeout - download_results = await self.scraper.get_pdfs( - target_urls=cleaned_result.get("links"), - storage_path=search_results["search_key"] - ) - - return await self._prepare_document_batches( - download_results, - cleaned_result["meta"] - ) - except Exception as e: - logger.exception(f"Error processing documents: {str(e)}") - return False - - async def _prepare_document_batches( - self, - download_results: Dict[str, Any], - metadata: Dict[str, Any] - ) -> Dict[str, Any]: - """ - Prepare document batches for processing. - - Args: - download_results (Dict[str, Any]): Results from the download operation - metadata (Dict[str, Any]): Metadata for the documents - - Returns: - Dict[str, Any]: Document batches and storage path - - """ - filtered_results = {} - - # Update metadata with file paths - for url, meta in metadata.items(): - if url in download_results["paths"]: - meta["file_path"] = download_results["paths"][url] - filtered_results[url] = meta - - # Create document batches - batches = create_batches( - download_results["storage_path"], - filtered_results, - LLMEC.BATCH_SIZE - ) - - return { - "batches": batches, - "storage_path": download_results["storage_path"] - } - - async def _create_and_populate_index( - self, - processed_docs: Dict[str, Any], - index_name: str - ) -> bool: - """ - Create and populate the search index. - - Args: - processed_docs (Dict[str, Any]): Processed document batches - index_name (str): Name of the index - - Returns: - bool: True if index creation and population is successful, False otherwise - - """ - try: - # Create index - index = await self.PC.create_index(index_name=index_name) - if not index: - logger.exception("Index creation failed") - return False - - # Populate index - return await self._populate_index(processed_docs["batches"]) - except Exception as e: - logger.exception(f"Error creating/populating index: {e}") - return False - - def _generate_index_name(self, search_key: str) -> str: - """ - Generate a valid index name. - - Args: - search_key (str): Search key for the index - Returns: - str: Valid index name - - """ - return (search_key.strip()[:LLMEC.INDEX_NAME_LEN] - .replace(" ", "-") - .lower() + "a") - - async def _populate_index(self, batches: List[Any]) -> bool: - """ - Populate the index with document batches. 
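To make the _generate_index_name rule above concrete, here is the same transformation as a tiny stand-alone helper with a worked example; the 64-character cap is only an assumption borrowed from the unit test later in this patch, not a confirmed value of LLMEC.INDEX_NAME_LEN.

    def make_index_name(search_key: str, max_len: int = 64) -> str:
        # strip -> truncate -> hyphenate spaces -> lowercase -> append the literal "a"
        return search_key.strip()[:max_len].replace(" ", "-").lower() + "a"

    assert make_index_name("Test Search Key With Spaces") == "test-search-key-with-spacesa"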
- - Args: - batches (List[Any]): List of document batches - Returns: - bool: True if index population is successful, False otherwise - - """ - try: - initial_count = current_count = await self.PC.get_idx_stat() - await self.PC.upsert_documents(batches=batches) - - # Wait for documents to be indexed - while current_count < initial_count + len(batches): - await asyncio.sleep(1) - current_count = await self.PC.get_idx_stat() - logger.info("Upserting document to pinecone...") - - return current_count >= initial_count + len(batches) - except Exception as e: - logger.exception(f"Error populating index: {str(e)}") - return False - - async def _generate_citations( - self, content: str, style: str) -> Dict[str, Any] | False: - """Generate citations from processed content. - - Args: - content (str): Processed content - style (str): Citation style - - Returns: - Dict[str, Any] | False: Citation results or False if error occurs - - - """ - try: - queries = chunk_text( - content, - max_tokens=LLMEC.QUERY_TOKEN_SIZE, - overlap_percent=5 - ) - # RAG + Rerank - results = await self.process_queries(queries) - filtered_results = filter_mixbread_results(results) - - sources_with_scores = [ - { - "title": result.get("title", ""), - "link": result.get("link", "") or result.get("url", ""), - "domain": result.get("domain", ""), - "journal": result.get("journal_title", ""), - "citation_doi": result.get("citation_doi", ""), - "citation_references": result.get("references", [""]), - "publication_date": result.get("publication_date", ""), - "author_name": result.get("author_name", "") or result.get("author", "") or result.get("authors", ""), - "abstract": result.get("abstract", ""), - "issn": result.get("issn", ""), - "type": result.get("type", ""), - "rerank_score": result.get("score", 0) - } for result in filtered_results - ] - - credibility_task = get_credibility_metrics(sources_with_scores) - citation_task = Citation(source=filtered_results).cite( - text=queries, - citation_style=style - ) - - # Start both tasks but handle credibility metrics first - credibility_metrics = await asyncio.gather(credibility_task, return_exceptions=True) - - if isinstance(credibility_metrics[0], Exception): - logger.exception(f"Credibility metrics failed: {str(credibility_metrics[0])}") - credibility_metrics = [] - else: - credibility_metrics = credibility_metrics[0] - - # Calculate scores immediately after getting credibility metrics - scores = await calculate_overall_score(credibility_metrics, sources_with_scores, - rerank_weight=0.6, credibility_weight=0.4) - - sources = [ - item["data"] for item in credibility_metrics if item["status"] == "success" - ] if credibility_metrics else [] - - citation_result = await citation_task - if isinstance(citation_result, Exception): - logger.exception(f"Citation generation failed: {str(citation_result)}") - raise CitationGenerationError("Failed to generate citations") - - return { - "result": citation_result, - "overall_score": scores["overall_score"], - "sources": sources - } - - except CitationGenerationError as e: - logger.exception(f"Error generating citation: {e}") - return False - except Exception as e: - logger.exception(f"Unexpected error in citation generation: {str(e)}") - return False - diff --git a/backend/mainService/src/services/source_credibility_metric_service.py b/backend/mainService/src/services/source_credibility_metric_service.py deleted file mode 100644 index 7ccd072..0000000 --- a/backend/mainService/src/services/source_credibility_metric_service.py +++ /dev/null @@ 
-1,126 +0,0 @@ -from typing import List, Dict, Any -import aiohttp -from src.config.log_config import setup_logging -import os -from functools import partial -import asyncio -from src.config.config import concurrency_config -from src.utils.concurrent_resources import credibility_executor, credibility_semaphore - -filename = os.path.basename(__file__) -logger = setup_logging(filename=filename) - -def _calculate_source_score(metric: Dict, source: Dict, - rerank_weight: float, credibility_weight: float) -> tuple[float, Dict]: - """ - Calculate weighted score for a single source in a separate thread. - Uses semaphore to limit concurrent calculations. - - Args: - metric (Dict): Credibility metric for the source - source (Dict): Source with rerank score - rerank_weight (float): Weight for rerank score - credibility_weight (float): Weight for credibility score - - Returns: - tuple[float, Dict]: Tuple of (weighted_score, updated_metric) - """ - with credibility_semaphore: - if metric["status"] != "success": - return 0.00, metric - - credibility_score = metric["data"]["credibility_score"] - rerank_score = source["rerank_score"] - - # Normalize rerank score to 0-1 range - normalized_rerank = min(max(rerank_score, 0), 1)* 100 - - # Calculate weighted score and normalize to 0-100 range - weighted_score = round((normalized_rerank * rerank_weight + - credibility_score * credibility_weight) , 2) - - # Update the credibility score in the metric data - metric["data"]["credibility_score"] = weighted_score - return weighted_score, metric - -async def get_credibility_metrics(sources: List[Dict]) -> List[Dict]: - """ - Call the credibility API to get metrics for sources. - Uses timeout handling for better reliability. - """ - credibility_metrics_api = os.getenv('CREDIBILITY_API_URL','') - if not credibility_metrics_api: - logger.error("CREDIBILITY_API_URL is not set") - return [] - - # Configure timeout - timeout = aiohttp.ClientTimeout(total=20) - - try: - async with aiohttp.ClientSession(timeout=timeout) as session: - async with session.post( - credibility_metrics_api, - json={'sources': sources}, - headers={'Content-Type': 'application/json'} - ) as response: - if response.status == 200: - return await response.json() - else: - logger.error(f"Credibility API error: {response.status}") - return [] - except asyncio.TimeoutError: - logger.error("Credibility API request timed out") - return [] - except Exception: - logger.exception("Error calling credibility API") - return [] - -async def calculate_overall_score(credibility_metrics: List[Dict], sources_with_scores: List[Dict], - rerank_weight: float = 0.6, credibility_weight: float = 0.4) -> Dict[str, Any]: - """ - Calculate weighted scores for each source and overall mean score using parallel processing. - Uses configured thread pool and semaphore for concurrent calculations. 
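Worked numbers for the _calculate_source_score formula above, using the default weights (rerank_weight=0.6, credibility_weight=0.4) and the values expected by test_calculate_overall_score_success later in this patch; note the rerank score is clamped to [0, 1] and scaled to 0..100, while the credibility score is used on whatever scale the credibility API returns it.

    rerank_score = 0.9
    credibility_score = 0.8

    normalized_rerank = min(max(rerank_score, 0), 1) * 100                  # 0.9 -> 90.0
    weighted = round(normalized_rerank * 0.6 + credibility_score * 0.4, 2)  # 54.0 + 0.32
    assert weighted == 54.32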
- - Args: - credibility_metrics (List[Dict]): List of credibility metrics for each source - sources_with_scores (List[Dict]): List of sources with their rerank scores - rerank_weight (float): Weight for rerank score (default 0.6) - credibility_weight (float): Weight for credibility score (default 0.4) - - Returns: - Dict[str, Any]: Dictionary containing source scores and overall mean score - """ - if not credibility_metrics or not sources_with_scores: - return {"overall_score": 0.00, "source_scores": []} - - try: - calculate_score = partial(_calculate_source_score, - rerank_weight=rerank_weight, - credibility_weight=credibility_weight) - - # Process in batches using configured size - source_scores = [] - for i in range(0, len(sources_with_scores), concurrency_config.CREDIBILITY_BATCH_SIZE): - batch_metrics = credibility_metrics[i:i + concurrency_config.CREDIBILITY_BATCH_SIZE] - batch_sources = sources_with_scores[i:i + concurrency_config.CREDIBILITY_BATCH_SIZE] - - # Calculate batch scores - batch_results = list(credibility_executor.map( - lambda x: calculate_score(x[0], x[1]), - zip(batch_metrics, batch_sources) - )) - - scores, updated_metrics = zip(*batch_results) if batch_results else ([], []) - source_scores.extend(scores) - credibility_metrics[i:i + concurrency_config.CREDIBILITY_BATCH_SIZE] = updated_metrics - - overall_mean = round(sum(source_scores) / len(source_scores), 2) if source_scores else 0.00 - - return { - "overall_score": overall_mean, - "source_scores": source_scores - } - - except Exception as e: - logger.exception(f"Error in score calculation: {str(e)}") - return {"overall_score": 0.00, "source_scores": []} diff --git a/backend/mainService/src/utils/concurrent_resources.py b/backend/mainService/src/utils/concurrent_resources.py deleted file mode 100644 index 0d33028..0000000 --- a/backend/mainService/src/utils/concurrent_resources.py +++ /dev/null @@ -1,15 +0,0 @@ -from concurrent.futures import ThreadPoolExecutor -from threading import Semaphore -from src.config.config import concurrency_config - -# Create thread pool for credibility calculations -credibility_executor = ThreadPoolExecutor( - max_workers=concurrency_config.CREDIBILITY_MAX_THREADS -) - -# Create semaphore for limiting concurrent operations -credibility_semaphore = Semaphore(concurrency_config.CREDIBILITY_MAX_CONCURRENT) - -def cleanup_resources(): - """Cleanup all concurrent resources""" - credibility_executor.shutdown(wait=True) diff --git a/backend/mainService/src/utils/file_utils.py b/backend/mainService/src/utils/file_utils.py deleted file mode 100644 index 44e1aff..0000000 --- a/backend/mainService/src/utils/file_utils.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -from src.config.log_config import setup_logging - -filename = os.path.basename(__file__) -logger = setup_logging(filename=filename) - - -class FileUtils: - """ - A utility class providing static methods for common file operations. - - Class Attributes: - ONE_MB (int): Constant representing one megabyte in bytes (1024 * 1024) - MAX_FILE_SIZE (int): Maximum allowed file size (5 MB) - """ - - ONE_MB = 1024 * 1024 - MAX_FILE_SIZE = 5 * ONE_MB - - @staticmethod - def check_file_exists(url: str, target_directory: str) -> str | bool: - """ - Check if a file exists in the specified directory based on the URL. 
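A side note on the credibility_executor.map call in calculate_overall_score above: Executor.map already accepts multiple iterables, so the lambda-over-zip form is equivalent to passing the two batch lists directly. A minimal sketch, with score_fn standing in for the functools.partial built from _calculate_source_score:

    from concurrent.futures import ThreadPoolExecutor

    def score_fn(metric: dict, source: dict) -> tuple:
        # Stand-in for partial(_calculate_source_score, rerank_weight=..., credibility_weight=...).
        return source["rerank_score"], metric

    metrics = [{"status": "success"}] * 3
    sources = [{"rerank_score": 0.9}] * 3
    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(score_fn, metrics, sources))  # no zip/lambda wrapper needed
    scores, updated_metrics = zip(*results) if results else ((), ())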
- - Args: - url (str): The URL from which to extract the filename - target_directory (str): The directory path to check for the file - - Returns: - str | bool: Full path to the file if it exists, False otherwise or on error - """ - - try: - filename = url.rstrip('/').split('/')[-1] - full_path = os.path.join(target_directory, filename) - return full_path if os.path.exists(full_path) else False - except Exception as e: - logger.exception(f"Error checking file existence: {e}") - return False - - @staticmethod - def ensure_directory(path: str) -> bool: - """ - Ensure that a directory exists, creating it if necessary. - - Args: - path (str): The directory path to create or verify - - Returns: - bool: True if the directory exists or was created successfully, - False if an error occurred - - """ - - try: - os.makedirs(path, exist_ok=True) - return True - except Exception as e: - logger.exception(f"Error creating directory {path}: {e}") - return False diff --git a/backend/mainService/src/utils/format_rerank_result.py b/backend/mainService/src/utils/format_rerank_result.py deleted file mode 100644 index 2301dd4..0000000 --- a/backend/mainService/src/utils/format_rerank_result.py +++ /dev/null @@ -1,118 +0,0 @@ -from fastapi import FastAPI -from pydantic import BaseModel -from typing import List, Dict, Any -import json -from mixedbread_ai.client import RerankingResponse -app = FastAPI() - - -class DocumentMetadata(BaseModel): - access_date: str - author: str - doi: str - file_path: str - journal_title: str - link: str - page_content: str - publication_date: str - publisher: str - title: str - type: str - volume: str - - -class Document(BaseModel): - id: str - metadata: DocumentMetadata - page_content: str - - -class RerankedResult(BaseModel): - index: int - score: float - document: Document - - -class RerankResponse(BaseModel): - model: str - data: List[RerankedResult] - - -def convert_rerank_result(rerank_result: RerankResponse) -> Dict[str, Any]: - """ - Convert a rerank response object to a dictionary format. - - Args: - rerank_result (RerankResponse): The rerank response object to convert - - Returns: - Dict[str, Any]: A dictionary containing the model and data from the rerank result - """ - - return { - "model": rerank_result.model, - "data": [ - { - "index": item.index, - "score": item.score, - "document": { - "id": item.document["id"], - "metadata": item.document["metadata"], - "page_content": item.document["page_content"] - } - } - for item in rerank_result.data - ] - } - - -def filter_results( - rerank_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - """ - Filter pinecone rerank results based on score threshold and remove duplicates. - - Args: - rerank_results (List[Dict[str, Any]]): List of rerank results to filter - - Returns: - List[Dict[str, Any]]: Filtered list of unique document metadata meeting the score threshold - """ - - unique_results = [] - seen_ids = set() - for result in rerank_results: - doc = result.data[0] - benchmark = 0.6 - if doc.document.id not in seen_ids and doc.score >= benchmark: - unique_results.append(doc.document.metadata) - with open("sample_output\\rerank_result.json", "w") as f: - json.dump(unique_results, f, indent=4) - seen_ids.add(doc.document.id) - return unique_results - - -def filter_mixbread_results( - rerank_results: RerankingResponse) -> List[Dict[str, Any]]: - """ - Filter Mixbread reranking results based on score threshold and remove duplicates. 
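Both filter helpers in format_rerank_result.py (filter_results above and filter_mixbread_results, whose body follows) boil down to the same two rules: keep only matches scoring at least the 0.6 benchmark and drop documents whose id has already been seen. Reduced to its core with plain dicts instead of the SDK response types:

    def keep_best_unique(matches: list, benchmark: float = 0.6) -> list:
        # Each match is assumed to look like {"id": ..., "score": ..., "metadata": {...}}.
        seen_ids = set()
        kept = []
        for match in matches:
            if match["score"] >= benchmark and match["id"] not in seen_ids:
                seen_ids.add(match["id"])
                kept.append({**match["metadata"], "score": match["score"]})
        return kept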
- - Args: - rerank_results (RerankingResponse): Reranking response from Mixbread - - Returns: - List[Dict[str, Any]]: Filtered list of unique document metadata meeting the score threshold - """ - - unique_results = [] - seen_ids = set() - for result in rerank_results: - benchmark = 0.6 - result = result.data[0] - doc: dict = result.input.get("metadata") - if doc.get("id") not in seen_ids and result.score >= benchmark: - seen_ids.add(doc.pop("id")) - doc["score"] = result.score - unique_results.append(doc) - # with open("sample_output\\rerank_result_mixbread.json", "a") as f: - # json.dump(unique_results, f, indent=4) - return unique_results diff --git a/backend/mainService/src/utils/web_utils.py b/backend/mainService/src/utils/web_utils.py deleted file mode 100644 index 8a133a7..0000000 --- a/backend/mainService/src/utils/web_utils.py +++ /dev/null @@ -1,78 +0,0 @@ -import requests -import random -from protego import Protego -import logging - - -class WebUtils: - """ - A utility class providing static methods for web-related operations. - - This class includes functionality for checking robots.txt rules and retrieving file sizes - from URLs. It implements proper web crawling etiquette by respecting robots.txt directives - and implementing appropriate delays between requests. - """ - @staticmethod - def check_robots_txt(base_url, target_url, user_agent): - """ - Checks robots.txt for crawl permissions and delay. - - Args: - base_url: The base URL of the website. - target_url: The URL to check against robots.txt. - user_agent: The user agent string. - - Returns: - A tuple (can_fetch, request_delay). - can_fetch: True if allowed to crawl, False otherwise. - request_delay: Delay in seconds, or -1 on error. - """ - try: - rb_txt = requests.get(f"{base_url}/robots.txt") - if rb_txt.status_code == 404: # robots.txt not found. - return True, 0 # allow crawling, no delay. - # Raise HTTPError for bad responses (4xx or 5xx) other than 404 - rb_txt.raise_for_status() - - rp = Protego.parse(rb_txt.text) - can_fetch = rp.can_fetch(target_url, user_agent) - crawl_delay = rp.crawl_delay(user_agent) or 0 - request_delay = random.uniform(crawl_delay, crawl_delay + 3) - - return can_fetch, request_delay - - except requests.exceptions.RequestException as e: - logging.error(f"Error fetching robots.txt: {e}") - return True, 0 # allow crawling, no delay. - except Exception as e: - logging.error(f"Error processing robots.txt: {e}") - return False, -1 - - @staticmethod - def get_file_size(url: str) -> int: - """ - Retrieve the size of a file at the specified URL using a HEAD request. - - Args: - url (str): The URL of the file to check - - Returns: - int: The size of the file in bytes, or -1 if the size cannot be determined - or an error occurs - - Raises: - Exception: Logs any errors that occur during the size check and returns -1 - - Note: - This method uses HEAD requests to minimize bandwidth usage when checking file sizes. - The size is logged in megabytes for convenience but returned in bytes. 
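For illustration, a typical call to WebUtils.check_robots_txt above; the URLs and user agent string are made-up placeholders, and per the docstring the delay is 0 when robots.txt is missing, while (False, -1) is returned if parsing fails.

    import time

    can_fetch, delay = WebUtils.check_robots_txt(
        base_url="https://example.com",
        target_url="https://example.com/research/paper.pdf",
        user_agent="CitationBot/1.0",
    )
    if can_fetch and delay >= 0:
        time.sleep(delay)   # respect the randomized crawl delay before fetching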
- """ - - try: - response = requests.head(url, allow_redirects=True) - size = int(response.headers.get('Content-Length', -1)) - logging.info(f"File size: {size / (1024 * 1024):.2f} MB") - return size - except Exception as e: - logging.error(f"Error getting file size: {e}") - return -1 diff --git a/backend/mainService/test/conftest.py b/backend/mainService/test/conftest.py deleted file mode 100644 index ef656ae..0000000 --- a/backend/mainService/test/conftest.py +++ /dev/null @@ -1,50 +0,0 @@ -import pytest -from fastapi.testclient import TestClient -from fastapi import FastAPI -from app import app -from unittest.mock import AsyncMock, MagicMock -from dotenv import load_dotenv -import nltk -from nltk.data import find - -load_dotenv() -try: - find('tokenizers/punkt') - find('punkt_tab') - -except LookupError as e: - nltk.download('punkt') - nltk.download('punkt_tab') - -@pytest.fixture -def test_client(): - return TestClient(app) - -@pytest.fixture -def mock_pinecone(): - mock_pc = AsyncMock() - mock_pc.cleanup = AsyncMock() - return mock_pc - -@pytest.fixture -def mock_summarize_llm(): - return MagicMock() - -@pytest.fixture -def mock_citation_llm(): - return MagicMock() - -@pytest.fixture -def mock_scraper(): - mock_scraper = AsyncMock() - mock_scraper.__aenter__.return_value = mock_scraper - mock_scraper.__aexit__.return_value = None - return mock_scraper - -@pytest.fixture -def mock_playwright_driver(): - mock_driver = AsyncMock() - mock_driver.quit = AsyncMock() - return mock_driver - -pytest_plugins = ['pytest_asyncio'] \ No newline at end of file diff --git a/backend/mainService/test/custom_exceptions/test_llm_exceptions.py b/backend/mainService/test/custom_exceptions/test_llm_exceptions.py deleted file mode 100644 index c686e5b..0000000 --- a/backend/mainService/test/custom_exceptions/test_llm_exceptions.py +++ /dev/null @@ -1,16 +0,0 @@ -import pytest -from src.custom_exceptions.llm_exceptions import SearchKeyGenerationError, CitationGenerationError - -def test_search_key_generation_error(): - error_message = "Failed to generate search key" - error = SearchKeyGenerationError(error_message) - - assert str(error) == error_message - assert isinstance(error, Exception) - -def test_citation_generation_error(): - error_message = "Error processing LLM request" - error = CitationGenerationError(error_message) - - assert str(error) == error_message - assert isinstance(error, Exception) \ No newline at end of file diff --git a/backend/mainService/test/test_citation_controller.py b/backend/mainService/test/test_citation_controller.py deleted file mode 100644 index 8ae2342..0000000 --- a/backend/mainService/test/test_citation_controller.py +++ /dev/null @@ -1,91 +0,0 @@ -import pytest -from fastapi.testclient import TestClient -from unittest.mock import AsyncMock, patch -from src.models.schema import CitationInput - -def test_get_citation_auto(test_client, mock_pinecone, mock_summarize_llm, mock_citation_llm, mock_scraper): - # Mock the app state - test_client.app.state.pc = mock_pinecone - test_client.app.state.summarize_llm = mock_summarize_llm - test_client.app.state.citation_llm = mock_citation_llm - test_client.app.state.async_content_scraper = mock_scraper - - # Test data - test_payload = { - "title": "Test Title", - "content": "Test content", - "citationStyle": "APA", - "formType": "auto" - } - - # Mock the service response - mock_response = { - "citations": ["Test Citation"], - "metadata": {"source": "test"} - } - - with 
patch('src.services.citation_service.CitationService.process_citation', - new_callable=AsyncMock) as mock_process: - mock_process.return_value = mock_response - - response = test_client.post("/citation/get_citation", json=test_payload) - - assert response.status_code == 200 - assert response.json() == mock_response - -def test_get_citation_web_with_invalid_sources(test_client, mock_pinecone, mock_summarize_llm, mock_citation_llm, mock_scraper): - # Mock the app state - test_client.app.state.pc = mock_pinecone - test_client.app.state.summarize_llm = mock_summarize_llm - test_client.app.state.citation_llm = mock_citation_llm - test_client.app.state.async_content_scraper = mock_scraper - - # Test data - test_payload = { - "title": "Test Title", - "content": "Test content", - "citationStyle": "APA", - "formType": "web", - "sources": ["source1", "source2"], - "supplementUrls": ["url1", "url2"] - } - - # Mock the service response - mock_response = { - "citations": ["Test Citation"], - "metadata": {"source": "test"} - } - - with patch('src.services.citation_service.CitationService.process_citation', - new_callable=AsyncMock) as mock_process: - mock_process.return_value = mock_response - - response = test_client.post("/citation/get_citation", json=test_payload) - - assert response.status_code == 422 - - -def test_get_citation_invalid_form_type(test_client): - # Test data with invalid form type - test_payload = { - "title": "Test Title", - "content": "Test content", - "citationStyle": "APA", - "formType": "invalid" - } - - response = test_client.post("/citation/get_citation", json=test_payload) - - assert response.status_code == 422 - -def test_get_citation_missing_title(test_client): - # Test data without title - test_payload = { - "content": "Test content", - "citationStyle": "APA", - "formType": "auto" - } - - response = test_client.post("/citation/get_citation", json=test_payload) - - assert response.status_code == 422 \ No newline at end of file diff --git a/backend/mainService/test/test_citation_service.py b/backend/mainService/test/test_citation_service.py deleted file mode 100644 index 1080487..0000000 --- a/backend/mainService/test/test_citation_service.py +++ /dev/null @@ -1,1019 +0,0 @@ -import pytest -from unittest.mock import AsyncMock, MagicMock, patch -from src.services.citation_service import CitationService -from src.llm.Pinecone import PineconeOperations -from src.scraper.async_content_scraper import AsyncContentScraper -from src.llm.chat_llm.Groq_llm import Summarize_llm -from src.llm.chat_llm.Azure_llm import Citation -from src.custom_exceptions.llm_exceptions import CitationGenerationError -from src.models.schema import Source -from src.scraper.async_searchApi import SearchApi -from src.llm.embedding_utils.reranker import rerank -from src.utils.format_rerank_result import filter_mixbread_results -from src.services.source_credibility_metric_service import get_credibility_metrics, calculate_overall_score - -@pytest.fixture -def mock_pinecone(): - mock_pc = AsyncMock(spec=PineconeOperations) - mock_pc.hybrid_query = AsyncMock() - mock_pc.create_index = AsyncMock(return_value=True) - mock_pc.get_idx_stat = AsyncMock(return_value=0) - mock_pc.upsert_documents = AsyncMock() - mock_pc.set_current_index = AsyncMock(return_value=False) - return mock_pc - -@pytest.fixture -def mock_scraper(): - mock_scraper = AsyncMock(spec=AsyncContentScraper) - mock_scraper.get_pdfs = AsyncMock() - return mock_scraper - -@pytest.fixture -def mock_summarize_llm(): - mock_llm = 
MagicMock(spec=Summarize_llm) - mock_llm.getKeywordSearchTerm = MagicMock(return_value="test-keyword") - return mock_llm - -@pytest.fixture -def mock_citation_llm(): - mock_llm = MagicMock(spec=Citation) - mock_llm.cite = AsyncMock(return_value=["Test Citation"]) - return mock_llm - -@pytest.fixture -def citation_service(mock_pinecone, mock_scraper, mock_summarize_llm, mock_citation_llm): - return CitationService( - PC=mock_pinecone, - scraper=mock_scraper, - summarize_llm=mock_summarize_llm, - citation_llm=mock_citation_llm - ) - -@pytest.mark.asyncio -@patch('src.services.citation_service.rerank') -async def test_process_single_query_success(mock_rerank, citation_service, mock_pinecone): - # Arrange - query = "test query" - mock_pinecone.hybrid_query.return_value = { - "matches": [ - {"id": "1", "score": 0.9, "metadata": {"content": "test content"}} - ] - } - mock_rerank.return_value = [{"id": "1", "score": 0.95, "content": "test content"}] - - # Act - result = await citation_service.process_single_query(query) - - # Assert - assert result is not None - mock_pinecone.hybrid_query.assert_called_once_with(query=query, top_k=5) - mock_rerank.assert_called_once() - -@pytest.mark.asyncio -@patch('src.services.citation_service.rerank') -async def test_process_queries_success(mock_rerank, citation_service, mock_pinecone): - # Arrange - queries = ["query1", "query2"] - mock_pinecone.hybrid_query.side_effect = [ - {"matches": [{"id": "1", "score": 0.9}]}, - {"matches": [{"id": "2", "score": 0.8}]} - ] - mock_rerank.return_value = [{"id": "1", "score": 0.95}] - - # Act - results = await citation_service.process_queries(queries) - - # Assert - assert len(results) == 2 - assert mock_pinecone.hybrid_query.call_count == 2 - assert mock_rerank.call_count == 2 - -@pytest.mark.asyncio -@patch('src.services.citation_service.rerank') -async def test_process_queries_retry(mock_rerank, citation_service, mock_pinecone): - # Arrange - queries = ["query1"] - mock_pinecone.hybrid_query.side_effect = [ - Exception("First attempt fails"), - {"matches": [{"id": "1", "score": 0.9}]} - ] - mock_rerank.return_value = [{"id": "1", "score": 0.95}] - - # Act - results = await citation_service.process_queries(queries) - - # Assert - assert len(results) == 1 - assert mock_pinecone.hybrid_query.call_count == 2 - assert mock_rerank.call_count == 1 - -# @pytest.mark.skip(reason="Skipping test due to making actual API calls") -@pytest.mark.asyncio -@patch('src.services.citation_service.SearchApi') -@patch('src.services.citation_service.rerank') -@patch('src.services.citation_service.filter_mixbread_results') -@patch('src.services.citation_service.get_credibility_metrics') -@patch('src.services.citation_service.calculate_overall_score') -@patch('src.llm.Async_prepare_data_for_embedding.load_document') -@patch('src.llm.Async_prepare_data_for_embedding.split_document') -@patch('src.llm.Async_prepare_data_for_embedding.append_metadata') -@patch('src.llm.Pinecone.PineconeOperations') -@patch('src.services.citation_service.Citation') -@patch('src.llm.chat_llm.Azure_llm.Genai_cite') -async def test_process_citation_auto_success( - mock_genai_cite, - mock_citation_class, - mock_pinecone_class, - mock_append_metadata, - mock_split_document, - mock_load_document, - mock_calculate_overall_score, - mock_get_credibility_metrics, - mock_filter_mixbread_results, - mock_rerank, - mock_search_api, - citation_service, - mock_scraper -): - # Arrange - title = "Test Title" - content = "Test content" - style = "APA" - - # Mock SearchApi - 
mock_search_api.clean_search = AsyncMock(return_value={ - "links": ["url1"], - "meta": {"url1": {"title": "Test Source"}} - }) - - # Mock Pinecone - mock_pinecone_instance = AsyncMock() - mock_pinecone_instance.hybrid_query = AsyncMock(return_value={ - "matches": [{"id": "1", "score": 0.9}] - }) - mock_pinecone_instance.set_current_index = AsyncMock(return_value=False) - mock_pinecone_instance.create_index = AsyncMock(return_value=True) - mock_pinecone_instance.get_idx_stat = AsyncMock(side_effect=[0, 1]) - mock_pinecone_instance.upsert_documents = AsyncMock() - mock_pinecone_class.return_value = mock_pinecone_instance - - # Replace the citation_service's Pinecone instance with our mock - citation_service.PC = mock_pinecone_instance - - # Mock Scraper - mock_scraper.get_pdfs.return_value = { - "paths": {"url1": "path1"}, - "storage_path": "test_path" - } - - # Mock Citation LLM - mock_citation_instance = AsyncMock() - mock_citation_instance.cite = AsyncMock(return_value=["Test Citation"]) - mock_citation_class.return_value = mock_citation_instance - - # Mock Summarize LLM - mock_summarize_instance = MagicMock() - mock_summarize_instance.getKeywordSearchTerm = MagicMock(return_value="test-keyword") - mock_genai_cite.merge_citation.return_value = ["Test Citation"] - - # Replace the citation_service's LLM instances with our mocks - citation_service.citation_llm = mock_citation_instance - citation_service.summarize_llm = mock_summarize_instance - - # Mock rerank and filter results - mock_rerank.return_value = [{"id": "1", "score": 0.95}] - mock_filter_mixbread_results.return_value = [{"id": "1", "score": 0.95}] - - # Mock credibility metrics - mock_get_credibility_metrics.return_value = [{"status": "success", "data": {"title": "Test Source"}}] - mock_calculate_overall_score.return_value = { - "overall_score": 84.00, - "source_scores": [84.00] - } - - # Mock document processing - mock_doc = MagicMock() - mock_doc.page_content = "Test content" - mock_doc.metadata = {"source": "path1"} - mock_load_document.return_value = [mock_doc] - mock_split_document.return_value = [mock_doc] - mock_append_metadata.return_value = [mock_doc] - - # Act - result = await citation_service.process_citation( - title=title, - content=content, - form_type="auto", - style=style - ) - - # Assert - assert result is not None - assert "result" in result - assert "overall_score" in result - assert "sources" in result - assert result["result"] == ["Test Citation"] - assert result["overall_score"] == 84.00 - assert len(result["sources"]) == 1 - -@pytest.mark.asyncio -@patch('src.services.citation_service.SearchApi') -@patch('src.services.citation_service.rerank') -@patch('src.services.citation_service.filter_mixbread_results') -@patch('src.services.citation_service.get_credibility_metrics') -@patch('src.services.citation_service.calculate_overall_score') -@patch('src.llm.Async_prepare_data_for_embedding.load_document') -@patch('src.llm.Async_prepare_data_for_embedding.split_document') -@patch('src.llm.Async_prepare_data_for_embedding.append_metadata') -@patch('src.llm.Pinecone.PineconeOperations') -@patch('src.services.citation_service.Citation') -@patch('src.llm.chat_llm.Azure_llm.Genai_cite') -async def test_process_citation_web_success( - mock_genai_cite, - mock_citation_class, - mock_pinecone_class, - mock_append_metadata, - mock_split_document, - mock_load_document, - mock_calculate_overall_score, - mock_get_credibility_metrics, - mock_filter_mixbread_results, - mock_rerank, - mock_search_api, - citation_service, - 
mock_scraper -): - # Arrange - title = "Test Title" - content = "Test content" - style = "APA" - sources = [ - Source( - url="http://test.com", - title="Test Source", - content="Test source content", - authors="Test Author" - ) - ] - - # Mock SearchApi - mock_search_api.clean_search = AsyncMock(return_value={ - "links": ["url1"], - "meta": {"url1": {"title": "Test Source"}} - }) - - # Mock Pinecone - mock_pinecone_instance = AsyncMock() - mock_pinecone_instance.hybrid_query = AsyncMock(return_value={ - "matches": [{"id": "1", "score": 0.9}] - }) - mock_pinecone_instance.set_current_index = AsyncMock(return_value=False) - mock_pinecone_instance.create_index = AsyncMock(return_value=True) - mock_pinecone_instance.get_idx_stat = AsyncMock(side_effect=[0, 1]) - mock_pinecone_instance.upsert_documents = AsyncMock() - mock_pinecone_class.return_value = mock_pinecone_instance - - # Replace the citation_service's Pinecone instance with our mock - citation_service.PC = mock_pinecone_instance - - # Mock Scraper - mock_scraper.get_pdfs.return_value = { - "paths": {"url1": "path1"}, - "storage_path": "test_path" - } - - # Mock Citation LLM - mock_citation_instance = AsyncMock() - mock_citation_instance.cite = AsyncMock(return_value=["Test Citation"]) - mock_citation_class.return_value = mock_citation_instance - - # Mock Summarize LLM - mock_summarize_instance = MagicMock() - mock_summarize_instance.getKeywordSearchTerm = MagicMock(return_value="test-keyword") - mock_genai_cite.merge_citation.return_value = ["Test Citation"] - - # Replace the citation_service's LLM instances with our mocks - citation_service.citation_llm = mock_citation_instance - citation_service.summarize_llm = mock_summarize_instance - - # Mock rerank and filter results - mock_rerank.return_value = [{"id": "1", "score": 0.95}] - mock_filter_mixbread_results.return_value = [{"id": "1", "score": 0.95}] - - # Mock credibility metrics - mock_get_credibility_metrics.return_value = [{"status": "success", "data": {"title": "Test Source"}}] - mock_calculate_overall_score.return_value = { - "overall_score": 84.00, - "source_scores": [84.00] - } - - # Mock document processing - mock_doc = MagicMock() - mock_doc.page_content = "Test content" - mock_doc.metadata = {"source": "path1"} - mock_load_document.return_value = [mock_doc] - mock_split_document.return_value = [mock_doc] - mock_append_metadata.return_value = [mock_doc] - - # Act - result = await citation_service.process_citation( - title=title, - content=content, - form_type="web", - style=style, - sources=sources, - supplement_urls=True - ) - - # Assert - assert result is not None - assert "result" in result - assert "overall_score" in result - assert "sources" in result - assert result["result"] == ["Test Citation"] - assert result["overall_score"] == 84.00 - assert len(result["sources"]) == 1 - -@pytest.mark.asyncio -@patch('src.services.citation_service.SearchApi') -@patch('src.services.citation_service.rerank') -@patch('src.services.citation_service.filter_mixbread_results') -@patch('src.services.citation_service.get_credibility_metrics') -@patch('src.services.citation_service.calculate_overall_score') -@patch('src.llm.Async_prepare_data_for_embedding.load_document') -@patch('src.llm.Async_prepare_data_for_embedding.split_document') -@patch('src.llm.Async_prepare_data_for_embedding.append_metadata') -@patch('src.llm.Pinecone.PineconeOperations') -@patch('src.services.citation_service.Citation') -@patch('src.llm.chat_llm.Azure_llm.Genai_cite') -async def 
test_process_citation_source_success( - mock_genai_cite, - mock_citation_class, - mock_pinecone_class, - mock_append_metadata, - mock_split_document, - mock_load_document, - mock_calculate_overall_score, - mock_get_credibility_metrics, - mock_filter_mixbread_results, - mock_rerank, - mock_search_api, - citation_service -): - # Arrange - title = "Test Title" - content = "Test content" - style = "APA" - sources = [ - Source( - url="http://test.com", - title="Test Source", - content="Test source content", - authors="Test Author" - ) - ] - - # Mock SearchApi - mock_search_api.clean_search = AsyncMock(return_value={ - "links": ["url1"], - "meta": {"url1": {"title": "Test Source"}} - }) - - # Mock Pinecone - mock_pinecone_instance = AsyncMock() - mock_pinecone_instance.hybrid_query = AsyncMock(return_value={ - "matches": [{"id": "1", "score": 0.9}] - }) - mock_pinecone_instance.set_current_index = AsyncMock(return_value=False) - mock_pinecone_instance.create_index = AsyncMock(return_value=True) - mock_pinecone_instance.get_idx_stat = AsyncMock(side_effect=[0, 1]) - mock_pinecone_instance.upsert_documents = AsyncMock() - mock_pinecone_class.return_value = mock_pinecone_instance - - # Replace the citation_service's Pinecone instance with our mock - citation_service.PC = mock_pinecone_instance - - # Mock Citation LLM - mock_citation_instance = AsyncMock() - mock_citation_instance.cite = AsyncMock(return_value=["Test Citation"]) - mock_citation_class.return_value = mock_citation_instance - - # Mock Summarize LLM - mock_summarize_instance = MagicMock() - mock_summarize_instance.getKeywordSearchTerm = MagicMock(return_value="test-keyword") - mock_genai_cite.merge_citation.return_value = ["Test Citation"] - - # Replace the citation_service's LLM instances with our mocks - citation_service.citation_llm = mock_citation_instance - citation_service.summarize_llm = mock_summarize_instance - - # Mock rerank and filter results - mock_rerank.return_value = [{"id": "1", "score": 0.95}] - mock_filter_mixbread_results.return_value = [{"id": "1", "score": 0.95}] - - # Mock credibility metrics - mock_get_credibility_metrics.return_value = [{"status": "success", "data": {"title": "Test Source"}}] - mock_calculate_overall_score.return_value = { - "overall_score": 84.00, - "source_scores": [84.00] - } - - # Mock document processing - mock_doc = MagicMock() - mock_doc.page_content = "Test content" - mock_doc.metadata = {"source": "path1"} - mock_load_document.return_value = [mock_doc] - mock_split_document.return_value = [mock_doc] - mock_append_metadata.return_value = [mock_doc] - - # Act - result = await citation_service.process_citation( - title=title, - content=content, - form_type="source", - style=style, - sources=sources - ) - - # Assert - assert result is not None - assert "result" in result - assert "overall_score" in result - assert "sources" in result - assert result["result"] == ["Test Citation"] - assert result["overall_score"] == 84.00 - assert len(result["sources"]) == 1 - -@pytest.mark.asyncio -@patch('src.services.citation_service.SearchApi') -@patch('src.services.citation_service.rerank') -@patch('src.services.citation_service.filter_mixbread_results') -@patch('src.services.citation_service.get_credibility_metrics') -@patch('src.services.citation_service.calculate_overall_score') -@patch('src.llm.Async_prepare_data_for_embedding.load_document') -@patch('src.llm.Async_prepare_data_for_embedding.split_document') -@patch('src.llm.Async_prepare_data_for_embedding.append_metadata') 
-@patch('src.llm.Pinecone.PineconeOperations') -@patch('src.services.citation_service.Citation') -@patch('src.llm.chat_llm.Azure_llm.Genai_cite') -async def test_process_citation_existing_index( - mock_genai_cite, - mock_citation_class, - mock_pinecone_class, - mock_append_metadata, - mock_split_document, - mock_load_document, - mock_calculate_overall_score, - mock_get_credibility_metrics, - mock_filter_mixbread_results, - mock_rerank, - mock_search_api, - citation_service -): - # Arrange - title = "Test Title" - content = "Test content" - style = "APA" - - # Mock SearchApi - mock_search_api.clean_search = AsyncMock(return_value={ - "links": ["url1"], - "meta": {"url1": {"title": "Test Source"}} - }) - - # Mock Pinecone - mock_pinecone_instance = AsyncMock() - mock_pinecone_instance.hybrid_query = AsyncMock(return_value={ - "matches": [{"id": "1", "score": 0.9}] - }) - mock_pinecone_instance.set_current_index = AsyncMock(return_value=True) - mock_pinecone_instance.create_index = AsyncMock(return_value=True) - mock_pinecone_instance.get_idx_stat = AsyncMock(side_effect=[0, 1]) - mock_pinecone_instance.upsert_documents = AsyncMock() - mock_pinecone_class.return_value = mock_pinecone_instance - - # Replace the citation_service's Pinecone instance with our mock - citation_service.PC = mock_pinecone_instance - - # Mock Citation LLM - mock_citation_instance = AsyncMock() - mock_citation_instance.cite = AsyncMock(return_value=["Test Citation"]) - mock_citation_class.return_value = mock_citation_instance - - # Mock Summarize LLM - mock_summarize_instance = MagicMock() - mock_summarize_instance.getKeywordSearchTerm = MagicMock(return_value="test-keyword") - mock_genai_cite.merge_citation.return_value = ["Test Citation"] - - # Replace the citation_service's LLM instances with our mocks - citation_service.citation_llm = mock_citation_instance - citation_service.summarize_llm = mock_summarize_instance - - # Mock rerank and filter results - mock_rerank.return_value = [{"id": "1", "score": 0.95}] - mock_filter_mixbread_results.return_value = [{"id": "1", "score": 0.95}] - - # Mock credibility metrics - mock_get_credibility_metrics.return_value = [{"status": "success", "data": {"title": "Test Source"}}] - mock_calculate_overall_score.return_value = { - "overall_score": 84.00, - "source_scores": [84.00] - } - - # Mock document processing - mock_doc = MagicMock() - mock_doc.page_content = "Test content" - mock_doc.metadata = {"source": "path1"} - mock_load_document.return_value = [mock_doc] - mock_split_document.return_value = [mock_doc] - mock_append_metadata.return_value = [mock_doc] - - # Act - result = await citation_service.process_citation( - title=title, - content=content, - form_type="auto", - style=style - ) - - # Assert - assert result is not None - assert "result" in result - assert "overall_score" in result - assert "sources" in result - assert result["result"] == ["Test Citation"] - assert result["overall_score"] == 84.00 - assert len(result["sources"]) == 1 - -@pytest.mark.asyncio -async def test_process_citation_error_handling(citation_service, mock_pinecone): - # Arrange - title = "Test Title" - content = "Test content" - style = "APA" - - mock_pinecone.hybrid_query.side_effect = Exception("Test error") - - # Act - result = await citation_service.process_citation( - title=title, - content=content, - form_type="auto", - style=style - ) - - # Assert - assert result is False - -@pytest.mark.asyncio -async def test_generate_index_name(citation_service): - # Arrange - search_key = "Test Search 
Key With Spaces" - - # Act - index_name = citation_service._generate_index_name(search_key) - - # Assert - assert isinstance(index_name, str) - assert len(index_name) <= 64 # Assuming LLMEC.INDEX_NAME_LEN is 64 - assert "-" in index_name - assert index_name.endswith("a") - -@pytest.mark.asyncio -@patch('src.services.citation_service.SearchApi') -@patch('src.services.citation_service.rerank') -@patch('src.services.citation_service.filter_mixbread_results') -@patch('src.services.citation_service.get_credibility_metrics') -@patch('src.services.citation_service.calculate_overall_score') -@patch('src.llm.Async_prepare_data_for_embedding.load_document') -@patch('src.llm.Async_prepare_data_for_embedding.split_document') -@patch('src.llm.Async_prepare_data_for_embedding.append_metadata') -@patch('src.llm.Pinecone.PineconeOperations') -@patch('src.services.citation_service.Citation') -@patch('src.llm.chat_llm.Azure_llm.Genai_cite') -async def test_process_citation_mla_style( - mock_genai_cite, - mock_citation_class, - mock_pinecone_class, - mock_append_metadata, - mock_split_document, - mock_load_document, - mock_calculate_overall_score, - mock_get_credibility_metrics, - mock_filter_mixbread_results, - mock_rerank, - mock_search_api, - citation_service -): - # Arrange - title = "Test Title" - content = "Test content" - style = "MLA" - - # Mock SearchApi - mock_search_api.clean_search = AsyncMock(return_value={ - "links": ["url1"], - "meta": {"url1": {"title": "Test Source"}} - }) - - # Mock Pinecone - mock_pinecone_instance = AsyncMock() - mock_pinecone_instance.hybrid_query = AsyncMock(return_value={ - "matches": [{"id": "1", "score": 0.9}] - }) - mock_pinecone_instance.set_current_index = AsyncMock(return_value=False) - mock_pinecone_instance.create_index = AsyncMock(return_value=True) - mock_pinecone_instance.get_idx_stat = AsyncMock(side_effect=[0, 1]) - mock_pinecone_instance.upsert_documents = AsyncMock() - mock_pinecone_class.return_value = mock_pinecone_instance - - # Replace the citation_service's Pinecone instance with our mock - citation_service.PC = mock_pinecone_instance - - # Mock Citation LLM - mock_citation_instance = AsyncMock() - mock_citation_instance.cite = AsyncMock(return_value=["Test MLA Citation"]) - mock_citation_class.return_value = mock_citation_instance - - # Mock Summarize LLM - mock_summarize_instance = MagicMock() - mock_summarize_instance.getKeywordSearchTerm = MagicMock(return_value="test-keyword") - mock_genai_cite.merge_citation.return_value = ["Test MLA Citation"] - - # Replace the citation_service's LLM instances with our mocks - citation_service.citation_llm = mock_citation_instance - citation_service.summarize_llm = mock_summarize_instance - - # Mock rerank and filter results - mock_rerank.return_value = [{"id": "1", "score": 0.95}] - mock_filter_mixbread_results.return_value = [{"id": "1", "score": 0.95}] - - # Mock credibility metrics - mock_get_credibility_metrics.return_value = [{"status": "success", "data": {"title": "Test Source"}}] - mock_calculate_overall_score.return_value = { - "overall_score": 84.00, - "source_scores": [84.00] - } - - # Mock document processing - mock_doc = MagicMock() - mock_doc.page_content = "Test content" - mock_doc.metadata = {"source": "path1"} - mock_load_document.return_value = [mock_doc] - mock_split_document.return_value = [mock_doc] - mock_append_metadata.return_value = [mock_doc] - - # Act - result = await citation_service.process_citation( - title=title, - content=content, - form_type="auto", - style=style - ) - - # 
Assert - assert result is not None - assert "result" in result - assert result["result"] == ["Test MLA Citation"] - -@pytest.mark.asyncio -@patch('src.services.citation_service.SearchApi') -@patch('src.services.citation_service.rerank') -@patch('src.services.citation_service.filter_mixbread_results') -@patch('src.services.citation_service.get_credibility_metrics') -@patch('src.services.citation_service.calculate_overall_score') -@patch('src.llm.Async_prepare_data_for_embedding.load_document') -@patch('src.llm.Async_prepare_data_for_embedding.split_document') -@patch('src.llm.Async_prepare_data_for_embedding.append_metadata') -@patch('src.llm.Pinecone.PineconeOperations') -@patch('src.services.citation_service.Citation') -@patch('src.llm.chat_llm.Azure_llm.Genai_cite') -async def test_process_citation_empty_content( - mock_genai_cite, - mock_citation_class, - mock_pinecone_class, - mock_append_metadata, - mock_split_document, - mock_load_document, - mock_calculate_overall_score, - mock_get_credibility_metrics, - mock_filter_mixbread_results, - mock_rerank, - mock_search_api, - citation_service -): - # Arrange - title = "Test Title" - content = "" - style = "APA" - - # Act - result = await citation_service.process_citation( - title=title, - content=content, - form_type="auto", - style=style - ) - - # Assert - assert result is False - -@pytest.mark.asyncio -@patch('src.services.citation_service.SearchApi') -@patch('src.services.citation_service.rerank') -@patch('src.services.citation_service.filter_mixbread_results') -@patch('src.services.citation_service.get_credibility_metrics') -@patch('src.services.citation_service.calculate_overall_score') -@patch('src.llm.Async_prepare_data_for_embedding.load_document') -@patch('src.llm.Async_prepare_data_for_embedding.split_document') -@patch('src.llm.Async_prepare_data_for_embedding.append_metadata') -@patch('src.llm.Pinecone.PineconeOperations') -@patch('src.services.citation_service.Citation') -@patch('src.llm.chat_llm.Azure_llm.Genai_cite') -async def test_process_citation_llm_error( - mock_genai_cite, - mock_citation_class, - mock_pinecone_class, - mock_append_metadata, - mock_split_document, - mock_load_document, - mock_calculate_overall_score, - mock_get_credibility_metrics, - mock_filter_mixbread_results, - mock_rerank, - mock_search_api, - citation_service -): - # Arrange - title = "Test Title" - content = "Test content" - style = "APA" - - # Mock SearchApi - mock_search_api.clean_search = AsyncMock(return_value={ - "links": ["url1"], - "meta": {"url1": {"title": "Test Source"}} - }) - - # Mock Pinecone - mock_pinecone_instance = AsyncMock() - mock_pinecone_instance.hybrid_query = AsyncMock(return_value={ - "matches": [{"id": "1", "score": 0.9}] - }) - mock_pinecone_instance.set_current_index = AsyncMock(return_value=False) - mock_pinecone_instance.create_index = AsyncMock(return_value=True) - mock_pinecone_instance.get_idx_stat = AsyncMock(side_effect=[0, 1]) - mock_pinecone_instance.upsert_documents = AsyncMock() - mock_pinecone_class.return_value = mock_pinecone_instance - - # Replace the citation_service's Pinecone instance with our mock - citation_service.PC = mock_pinecone_instance - - # Mock Citation LLM to raise an error - mock_citation_instance = AsyncMock() - mock_citation_instance.cite = AsyncMock(side_effect=CitationGenerationError("Test error")) - mock_citation_class.return_value = mock_citation_instance - - # Replace the citation_service's LLM instance with our mock - citation_service.citation_llm = mock_citation_instance - - # 
Act - result = await citation_service.process_citation( - title=title, - content=content, - form_type="auto", - style=style - ) - - # Assert - assert result is False - -@pytest.mark.asyncio -@patch('src.services.citation_service.SearchApi') -@patch('src.services.citation_service.rerank') -@patch('src.services.citation_service.filter_mixbread_results') -@patch('src.services.citation_service.get_credibility_metrics') -@patch('src.services.citation_service.calculate_overall_score') -@patch('src.llm.Async_prepare_data_for_embedding.load_document') -@patch('src.llm.Async_prepare_data_for_embedding.split_document') -@patch('src.llm.Async_prepare_data_for_embedding.append_metadata') -@patch('src.llm.Pinecone.PineconeOperations') -@patch('src.services.citation_service.Citation') -@patch('src.llm.chat_llm.Azure_llm.Genai_cite') -async def test_process_citation_supplement_urls_failure( - mock_genai_cite, - mock_citation_class, - mock_pinecone_class, - mock_append_metadata, - mock_split_document, - mock_load_document, - mock_calculate_overall_score, - mock_get_credibility_metrics, - mock_filter_mixbread_results, - mock_rerank, - mock_search_api, - citation_service, - mock_scraper -): - # Arrange - title = "Test Title" - content = "Test content" - style = "APA" - sources = [ - Source( - url="http://test.com", - title="Test Source", - content="Test source content", - authors="Test Author" - ) - ] - - # Mock SearchApi to raise an error - mock_search_api.clean_search = AsyncMock(side_effect=Exception("Search API error")) - - # Mock Scraper to raise an error - mock_scraper.get_pdfs = AsyncMock(side_effect=Exception("Scraper error")) - - # Act - result = await citation_service.process_citation( - title=title, - content=content, - form_type="web", - style=style, - sources=sources, - supplement_urls=True - ) - - # Assert - assert result is False - -@pytest.mark.asyncio -@patch('src.services.citation_service.SearchApi') -@patch('src.services.citation_service.rerank') -@patch('src.services.citation_service.filter_mixbread_results') -@patch('src.services.citation_service.get_credibility_metrics') -@patch('src.services.citation_service.calculate_overall_score') -@patch('src.llm.Async_prepare_data_for_embedding.load_document') -@patch('src.llm.Async_prepare_data_for_embedding.split_document') -@patch('src.llm.Async_prepare_data_for_embedding.append_metadata') -@patch('src.llm.Pinecone.PineconeOperations') -@patch('src.services.citation_service.Citation') -@patch('src.llm.chat_llm.Azure_llm.Genai_cite') -async def test_process_citation_search_api_error( - mock_genai_cite, - mock_citation_class, - mock_pinecone_class, - mock_append_metadata, - mock_split_document, - mock_load_document, - mock_calculate_overall_score, - mock_get_credibility_metrics, - mock_filter_mixbread_results, - mock_rerank, - mock_search_api, - citation_service -): - # Arrange - title = "Test Title" - content = "Test content" - style = "APA" - - # Mock SearchApi to raise an error - mock_search_api.clean_search = AsyncMock(side_effect=Exception("Search API error")) - - # Act - result = await citation_service.process_citation( - title=title, - content=content, - form_type="auto", - style=style - ) - - # Assert - assert result is False - assert mock_search_api.clean_search.call_count == 1 - -@pytest.mark.asyncio -@patch('src.services.citation_service.SearchApi') -@patch('src.services.citation_service.rerank') -@patch('src.services.citation_service.filter_mixbread_results') -@patch('src.services.citation_service.get_credibility_metrics') 
-@patch('src.services.citation_service.calculate_overall_score') -@patch('src.llm.Async_prepare_data_for_embedding.load_document') -@patch('src.llm.Async_prepare_data_for_embedding.split_document') -@patch('src.llm.Async_prepare_data_for_embedding.append_metadata') -@patch('src.llm.Pinecone.PineconeOperations') -@patch('src.services.citation_service.Citation') -@patch('src.llm.chat_llm.Azure_llm.Genai_cite') -async def test_process_citation_pinecone_error( - mock_genai_cite, - mock_citation_class, - mock_pinecone_class, - mock_append_metadata, - mock_split_document, - mock_load_document, - mock_calculate_overall_score, - mock_get_credibility_metrics, - mock_filter_mixbread_results, - mock_rerank, - mock_search_api, - citation_service -): - # Arrange - title = "Test Title" - content = "Test content" - style = "APA" - - # Mock SearchApi - mock_search_api.clean_search = AsyncMock(return_value={ - "links": ["url1"], - "meta": {"url1": {"title": "Test Source"}} - }) - - # Mock Pinecone to raise an error - mock_pinecone_instance = AsyncMock() - mock_pinecone_instance.hybrid_query = AsyncMock(side_effect=Exception("Pinecone error")) - mock_pinecone_instance.set_current_index = AsyncMock(return_value=False) - mock_pinecone_instance.create_index = AsyncMock(return_value=True) - mock_pinecone_instance.get_idx_stat = AsyncMock(side_effect=[0, 1]) - mock_pinecone_instance.upsert_documents = AsyncMock() - mock_pinecone_class.return_value = mock_pinecone_instance - - # Replace the citation_service's Pinecone instance with our mock - citation_service.PC = mock_pinecone_instance - - # Act - result = await citation_service.process_citation( - title=title, - content=content, - form_type="auto", - style=style - ) - - # Assert - assert result is False - -@pytest.mark.asyncio -@patch('src.services.citation_service.SearchApi') -@patch('src.services.citation_service.rerank') -@patch('src.services.citation_service.filter_mixbread_results') -@patch('src.services.citation_service.get_credibility_metrics') -@patch('src.services.citation_service.calculate_overall_score') -@patch('src.llm.Async_prepare_data_for_embedding.load_document') -@patch('src.llm.Async_prepare_data_for_embedding.split_document') -@patch('src.llm.Async_prepare_data_for_embedding.append_metadata') -@patch('src.llm.Pinecone.PineconeOperations') -@patch('src.services.citation_service.Citation') -@patch('src.llm.chat_llm.Azure_llm.Genai_cite') -async def test_process_citation_document_processing_error( - mock_genai_cite, - mock_citation_class, - mock_pinecone_class, - mock_append_metadata, - mock_split_document, - mock_load_document, - mock_calculate_overall_score, - mock_get_credibility_metrics, - mock_filter_mixbread_results, - mock_rerank, - mock_search_api, - citation_service -): - # Arrange - title = "Test Title" - content = "Test content" - style = "APA" - - # Mock SearchApi - mock_search_api.clean_search = AsyncMock(return_value={ - "links": ["url1"], - "meta": {"url1": {"title": "Test Source"}} - }) - - # Mock Pinecone - mock_pinecone_instance = AsyncMock() - mock_pinecone_instance.hybrid_query = AsyncMock(return_value={ - "matches": [{"id": "1", "score": 0.9}] - }) - mock_pinecone_instance.set_current_index = AsyncMock(return_value=False) - mock_pinecone_instance.create_index = AsyncMock(return_value=True) - mock_pinecone_instance.get_idx_stat = AsyncMock(side_effect=[0, 1]) - mock_pinecone_instance.upsert_documents = AsyncMock() - mock_pinecone_class.return_value = mock_pinecone_instance - - # Replace the citation_service's Pinecone 
instance with our mock - citation_service.PC = mock_pinecone_instance - - # Mock document processing to raise an error - mock_load_document.side_effect = Exception("Document processing error") - - # Act - result = await citation_service.process_citation( - title=title, - content=content, - form_type="auto", - style=style - ) - - # Assert - assert result is False \ No newline at end of file diff --git a/backend/mainService/test/test_health_controller.py b/backend/mainService/test/test_health_controller.py deleted file mode 100644 index 968b038..0000000 --- a/backend/mainService/test/test_health_controller.py +++ /dev/null @@ -1,8 +0,0 @@ -import pytest -from fastapi.testclient import TestClient - -def test_health_check(test_client): - response = test_client.get("/health") - - assert response.status_code == 200 - assert response.json() == {"status": "Healthy"} \ No newline at end of file diff --git a/backend/mainService/test/test_source_credibility_metric_service.py b/backend/mainService/test/test_source_credibility_metric_service.py deleted file mode 100644 index 229b572..0000000 --- a/backend/mainService/test/test_source_credibility_metric_service.py +++ /dev/null @@ -1,140 +0,0 @@ -import pytest -from unittest.mock import patch, AsyncMock -from src.services.source_credibility_metric_service import get_credibility_metrics, calculate_overall_score - -@pytest.mark.asyncio -async def test_get_credibility_metrics_success(): - # Arrange - sources = [ - { - "title": "Test Source", - "link": "http://test.com", - "domain": "test.com", - "journal": "Test Journal", - "citation_doi": "10.1234/test", - "citation_references": ["ref1", "ref2"], - "publication_date": "2024-01-01", - "author_name": "Test Author", - "abstract": "Test abstract", - "issn": "1234-5678", - "type": "journal" - } - ] - - mock_response = [ - { - "status": "success", - "data": { - "credibility_score": 0.85, - "metrics": { - "authority": 0.9, - "reliability": 0.8 - } - } - } - ] - - with patch('aiohttp.ClientSession.post') as mock_post: - mock_post.return_value.__aenter__.return_value.status = 200 - mock_post.return_value.__aenter__.return_value.json = AsyncMock(return_value=mock_response) - - # Act - result = await get_credibility_metrics(sources) - - # Assert - assert result == mock_response - mock_post.assert_called_once() - -@pytest.mark.asyncio -async def test_get_credibility_metrics_api_error(): - # Arrange - sources = [{"title": "Test Source"}] - - with patch('aiohttp.ClientSession.post') as mock_post: - mock_post.return_value.__aenter__.return_value.status = 500 - - # Act - result = await get_credibility_metrics(sources) - - # Assert - assert result == [] - -@pytest.mark.asyncio -async def test_get_credibility_metrics_exception(): - # Arrange - sources = [{"title": "Test Source"}] - - with patch('aiohttp.ClientSession.post', side_effect=Exception("API Error")): - - # Act - result = await get_credibility_metrics(sources) - - # Assert - assert result == [] - -@pytest.mark.asyncio -async def test_calculate_overall_score_success(): - # Test data - credibility_metrics = [ - { - "status": "success", - "data": {"credibility_score": 0.8} - } - ] - sources_with_scores = [ - { - "rerank_score": 0.9 - } - ] - - result = await calculate_overall_score(credibility_metrics, sources_with_scores) - assert isinstance(result, dict) - assert "overall_score" in result - assert "source_scores" in result - assert result["overall_score"] == 54.32 # ((0.9 *100 )*0.6 + 0.8 * 0.4) - -@pytest.mark.asyncio -async def 
test_calculate_overall_score_empty(): - result = await calculate_overall_score([], []) - assert result["overall_score"] == 0.00 - assert result["source_scores"] == [] - -@pytest.mark.asyncio -async def test_calculate_overall_score_mixed_status(): - credibility_metrics = [ - {"status": "success", "data": {"credibility_score": 0.8}}, - {"status": "failed", "data": {"credibility_score": 0.5}} - ] - sources_with_scores = [ - {"rerank_score": 0.9}, - {"rerank_score": 0.7} - ] - - result = await calculate_overall_score(credibility_metrics, sources_with_scores) - print(result) - assert len(result["source_scores"]) == 2 - assert result["source_scores"][0] == 54.32 - -@pytest.mark.asyncio -async def test_calculate_overall_score_missing_data(): - credibility_metrics = [ - {"status": "success", "data": {}} - ] - sources_with_scores = [ - {"rerank_score": 0.9} - ] - - result = await calculate_overall_score(credibility_metrics, sources_with_scores) - assert result["overall_score"] == 0.00 - -@pytest.mark.asyncio -async def test_calculate_overall_score_exception(): - credibility_metrics = [ - {"status": "success", "data": None} - ] - sources_with_scores = [ - {"rerank_score": 0.9} - ] - - result = await calculate_overall_score(credibility_metrics, sources_with_scores) - assert result["overall_score"] == 0.00 \ No newline at end of file diff --git a/backend/metricsService/.env.example b/backend/metricsService/.env.example deleted file mode 100644 index 79ce8d2..0000000 --- a/backend/metricsService/.env.example +++ /dev/null @@ -1,5 +0,0 @@ -# Redis Configuration -REDIS_HOST=localhost -REDIS_PORT=6379 -REDIS_DB=0 -REDIS_PASSWORD=your_redis_password_here \ No newline at end of file diff --git a/backend/metricsService/Dockerfile b/backend/metricsService/Dockerfile deleted file mode 100644 index 5cf1198..0000000 --- a/backend/metricsService/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -FROM python:3.11-slim - -WORKDIR /app - -# Install system dependencies -# Installs essential tools for compiling software from source, often needed for Python package dependencies.(build-essential) -# Removes the package lists downloaded during the update to reduce the image size. -RUN apt-get update && apt-get install -y \ - build-essential \ - && rm -rf /var/lib/apt/lists/* - -# Set the PATH environment variable to include /app -ENV PATH="/app:${PATH}" - -# Copy requirements first to leverage Docker cache -COPY requirements.txt . 
- -# Install Python dependencies -RUN pip install --no-cache-dir -r requirements.txt - -# Copy the rest of the application -COPY ./src/ /app/src/ - -RUN cd /app/src - -# Expose the port the app runs on -EXPOSE 8000 - -# Command to run the application -CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/backend/metricsService/README.md b/backend/metricsService/README.md deleted file mode 100644 index 04ae6d996f6f9f3ae4204e3fa4d97089c446b3d4..0000000000000000000000000000000000000000 diff --git a/backend/metricsService/main.py b/backend/metricsService/main.py deleted file mode 100644 index 4c0bcf8..0000000 --- a/backend/metricsService/main.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -Citation Credibility Service API - -This is the main entry point for the Citation Credibility Service API, which provides -endpoints for analyzing and scoring the credibility of academic citations and sources. - -Key Features: -- RESTful API endpoints for credibility analysis -- CORS support for cross-origin requests -- Versioned API endpoints (/api/v1) -- Health check endpoint -- Configurable through environment variables - -Configuration: -- Environment variables are loaded from .env file -- CORS is configured to allow all origins -- Logging is configured through src.utils.logging_config - -Example Usage: - $ uvicorn src.main:app --reload - -Deployment: - The service can be deployed using any ASGI server (e.g. uvicorn, hypercorn) - and is configured to run on port 8000 by default.
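A quick smoke test against a locally running instance might look like the sketch below (the base URL is an assumption based on the default port noted above):

import requests  # assumes the requests package is available

BASE = "http://localhost:8000"  # hypothetical local deployment
print(requests.get(f"{BASE}/").json())               # expected: {"message": "Citation Credibility Service is running"}
print(requests.get(f"{BASE}/api/v1/health").json())  # expected: {"status": "healthy"}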
-""" - -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware -from src.utils.logging_config import get_logger -from src.api.endpoints import router as api_router -from dotenv import load_dotenv - -# Load environment variables -load_dotenv() - -# Get logger -logger = get_logger(__name__) - - -app = FastAPI( - title="Citation Credibility Service", - description="API for calculating credibility scores of academic sources", - version="1.0.0", -) - -# Add CORS middleware -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -# Include versioned routers -app.include_router(api_router, prefix="/api/v1") - -# Health check endpoint -@app.get("/") -async def root(): - return {"message": "Citation Credibility Service is running"} - -# Uncomment to run via uvicorn directly -# if __name__ == "__main__": -# import uvicorn -# port = int(os.environ.get("PORT", 8000)) -# uvicorn.run("main:app", host="0.0.0.0", port=port, reload=True) diff --git a/backend/metricsService/pytest.ini b/backend/metricsService/pytest.ini deleted file mode 100644 index 0102b0a..0000000 --- a/backend/metricsService/pytest.ini +++ /dev/null @@ -1,2 +0,0 @@ -[pytest] -asyncio_default_fixture_loop_scope = function diff --git a/backend/metricsService/requirements-test.txt b/backend/metricsService/requirements-test.txt deleted file mode 100644 index f17b55e..0000000 --- a/backend/metricsService/requirements-test.txt +++ /dev/null @@ -1,5 +0,0 @@ -pytest==8.3.5 -pytest-asyncio==0.26.0 -pytest-cov==4.1.0 -httpx==0.25.2 -pytest-mock==3.12.0 \ No newline at end of file diff --git a/backend/metricsService/requirements.txt b/backend/metricsService/requirements.txt deleted file mode 100644 index 3d56cd0..0000000 --- a/backend/metricsService/requirements.txt +++ /dev/null @@ -1,10 +0,0 @@ -redis>=4.2.0 -fastapi==0.115.11 -issn==0.0.3 -pydantic==2.10.6 -pytest==8.3.4 -python-dotenv==1.0.1 -Requests==2.32.3 -scholarly==1.7.11 -uvicorn - diff --git a/backend/metricsService/src/__init__.py b/backend/metricsService/src/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/backend/metricsService/src/api/endpoints.py b/backend/metricsService/src/api/endpoints.py deleted file mode 100644 index ecdf15a..0000000 --- a/backend/metricsService/src/api/endpoints.py +++ /dev/null @@ -1,156 +0,0 @@ -""" -API Endpoints Module - -This module contains all the API endpoints for the Citation Credibility Service. -It handles both single and batch requests for credibility analysis, with support -for detailed results and controlled concurrency. 
- -Endpoints: -- POST /credibility: Calculate credibility score for a single source -- POST /credibility/batch: Calculate credibility scores for multiple sources -- GET /health: Health check endpoint - -Features: -- Type validation using Pydantic models -- Detailed and summary response formats -- Controlled concurrency for batch processing -- Configurable timeout per request -- Comprehensive error handling - -Implementation Details: -- Uses FastAPI's APIRouter for endpoint organization -- Leverages asyncio for concurrent processing -- Implements semaphore-based concurrency control -- Provides timeout protection for individual requests -- Logs all errors for debugging and monitoring -""" - -from fastapi import APIRouter, HTTPException, Query -from typing import List, Dict, Any -import asyncio -from src.models.schemas import CredibilityRequest, BatchCredibilityRequest -from src.services.credibility_service import calculate_credibility -from src.utils.logging_config import get_logger -from src.utils.api_config import MAX_CONCURRENT_WORKERS, DEFAULT_CONCURRENT_WORKERS - -logger = get_logger(__name__) -router = APIRouter() - -@router.post("/credibility", response_model=Dict[str, Any]) -async def compute_credibility( - request: CredibilityRequest, - detailed: bool = Query(default=False, description="Return detailed results") -): - """Calculate credibility score for a single source""" - try: - result = await calculate_credibility(request) - # Store the total score before modifying the result - total_score = result.get("total_score", 0) - - if detailed: - # Don't pop the total_score, just create a new dict without it for components - components = {k: v for k, v in result.items() if k != "total_score"} - return { - "status": "success", - "data": { - "credibility_score": total_score, - "component": components, - "url": request.domain, - "title": request.title, - "type": request.type - } - } - else: - return { - "status": "success", - "data": { - "credibility_score": total_score, - "url": request.domain, - "title": request.title, - "type": request.type - } - } - except Exception as e: - logger.exception(f"Error calculating credibility: {str(e)}") - raise HTTPException( - status_code=400, - detail="Error calculating credibility score" - ) - -@router.post("/credibility/batch", response_model=List[Dict[str, Any]]) -async def compute_credibility_batch( - requests: BatchCredibilityRequest, - detail: bool = Query(default=False, description="Return detailed results"), - max_concurrent: int = Query(default=10, description="Maximum concurrent requests"), - timeout: float = Query(default=30.0, description="Timeout per request in seconds") -): - """Calculate credibility scores for multiple sources with controlled concurrency""" - try: - logger.info(f"Received batch of {len(requests.sources)} requests") - logger.info(f"requests: {requests}") - - # Create semaphore for controlled concurrency - semaphore = asyncio.Semaphore(min(max(max_concurrent, DEFAULT_CONCURRENT_WORKERS), MAX_CONCURRENT_WORKERS)) - - async def process_request_with_semaphore(req): - async with semaphore: - try: - async with asyncio.timeout(timeout*3): - result = await calculate_credibility(req) - if detail: - return { - "status": "success", - "data": result - } - else: - return { - "status": "success", - "data": { - "credibility_score": result["total_score"], - "url": req.link, - "title": req.title, - "type": req.type - } - } - except asyncio.TimeoutError: - return { - "status": "error", - "data": { - "credibility_score": 0, - "url": req.link, 
- "title": req.title, - "type": req.type, - "error": "Request timed out" - } - } - except Exception as e: - return { - "status": "error", - "data": { - "credibility_score": 0, - "url": req.link, - "title": req.title, - "type": req.type, - "error": str(e) - } - } - - # Process all requests concurrently - results = await asyncio.gather( - *(process_request_with_semaphore(req) for req in requests.sources) - ) - - logger.info(f"Completed batch processing of {len(results)} requests") - return results - - except Exception as e: - logger.exception(f"Error processing batch request: {str(e)}") - raise HTTPException( - status_code=500, - detail="Error processing batch request" - ) - -@router.get("/health") -async def health_check(): - #"""API health check endpoint""" - return {"status": "healthy"} diff --git a/backend/metricsService/src/models/schemas.py b/backend/metricsService/src/models/schemas.py deleted file mode 100644 index bd66625..0000000 --- a/backend/metricsService/src/models/schemas.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -Data Models and Schemas Module - -This module contains all Pydantic models used for request validation and response -formatting in the Citation Credibility Service API. These models ensure type safety -and provide automatic documentation for the API endpoints. - -Key Models: -- CredibilityRequest: Input model for single credibility analysis -- ComponentScore: Model for individual credibility components -- CredibilityResponse: Standard response format for credibility results -- SimplifiedCredibilityResponse: Compact response format -- BatchCredibilityRequest: Input model for batch credibility analysis - -Features: -- Automatic field validation and type checking -- Detailed field descriptions for API documentation -- Support for optional fields with default values -- Nested model structures for complex data -- Consistent response formats - -Usage: -These models are used throughout the API endpoints to validate incoming requests -and format responses. They also generate the API documentation automatically. 
-""" - -from pydantic import BaseModel, Field, ConfigDict -from typing import Optional, List, Dict, Any - -class CredibilityRequest(BaseModel): - domain: Optional[str] = Field(None, description="Website domain of the publication") - citation_doi: Optional[str] = Field(None, description="DOI of the publication") - journal: Optional[str] = Field(None, description="Journal name") - publication_date: Optional[str] = Field( - None, - alias="publicationDate", - description="Publication date or year" - ) - author_id: Optional[str| List[str]] = Field(None, description="ORCID ID of the author") - author_name: Optional[str| List[str]] = Field( - None, - alias="authors", - description="Author name" - ) - title: Optional[str] = Field(None, description="Title of the publication") - type: Optional[str] = Field(None, description="Type of publication (article, book, etc.)") - issn: Optional[str] = Field(None, description="ISSN of the journal") - link: Optional[str] = Field( - None, - alias="url", - description="Link to the publication" - ) - - model_config = ConfigDict(populate_by_name=True) - -class ComponentScore(BaseModel): - score: float = Field(..., description="Raw score out of 100") - weighted_score: float = Field(..., description="Score after applying weight") - weight: float = Field(..., description="Weight factor applied") - available: bool = Field(False, description="Whether this data was available") - -class CredibilityResponse(BaseModel): - status: str = Field("success", description="Status of the request") - data: Dict[str, Any] = Field(..., description="Credibility score data") - -class SimplifiedCredibilityResponse(BaseModel): - status: str = Field("success", description="Status of the request") - data: Dict[str, Any] = Field(..., description="Simplified credibility data") - -class BatchCredibilityRequest(BaseModel): - sources: List[CredibilityRequest] = Field(..., description="List of sources to evaluate") diff --git a/backend/metricsService/src/services/author_reputation.py b/backend/metricsService/src/services/author_reputation.py deleted file mode 100644 index 5ca821a..0000000 --- a/backend/metricsService/src/services/author_reputation.py +++ /dev/null @@ -1,170 +0,0 @@ -""" -Author Reputation Service Module - -This module handles the calculation of author reputation scores by aggregating data -from multiple academic sources. It provides a unified interface for assessing -author credibility based on their publication history and impact. 
- -Key Functions: -- get_authorship_reputation: Main function that calculates reputation score -- get_openalex_author_reputation: Fetches data from OpenAlex -- get_semantic_scholar_author_reputation: Fetches data from Semantic Scholar -- get_google_scholar_author_reputation: Fetches data from Google Scholar - -Data Sources: -- ORCID API -- OpenAlex API -- Semantic Scholar API -- Google Scholar (via scholarly package) - -Scoring Methodology: -- Combines h-index and publication count from multiple sources -- Uses weighted average favoring h-index (70% weight) -- Ensures minimum score for any author with an ID or name -- Takes maximum values across sources for robustness - -Features: -- Asynchronous API calls for better performance -- Rate limiting to prevent API abuse -- Comprehensive error handling and logging -- Fallback mechanism when sources fail -""" - -import asyncio -import requests -from typing import Optional, List -from scholarly import scholarly -from ..utils.api_config import ( - ORCID_API, - SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API, - OPEN_ALEX_AUTHOR_API, - DEFAULT_TIMEOUT -) -from ..utils.api_utils import rate_limit -from ..utils.logging_config import get_logger - -logger = get_logger(__name__) - -async def get_authorship_reputation(author_id: Optional[str] = None, author_name: Optional[str | List[str]] = None) -> float: - """ - Fetch author reputation from ORCID (if available) and concurrently from OpenAlex, - Semantic Scholar, and Google Scholar using the author's name. If one source fails, - the others serve as fallback. When all are available, combine their outputs by averaging - h-index values and taking the maximum publication count. - """ - if not author_id and not author_name: - return 0 - - h_index_values = [] - pub_count_values = [] - - # ORCID lookup if available - if author_id: - await rate_limit() - try: - orcid_response = requests.get( - f"{ORCID_API}{author_id}/works", - headers={"Accept": "application/json"}, - timeout=DEFAULT_TIMEOUT - ) - if orcid_response.status_code == 200: - orcid_data = orcid_response.json() - works = orcid_data.get("group", []) - pub_count_values.append(len(works)) - # Add a base score for having an ORCID - h_index_values.append(20) - except Exception as e: - logger.exception(f"ORCID API error: {e}") - - # Concurrent calls for OpenAlex, Semantic Scholar, and Google Scholar - if author_name and isinstance(author_name, list): - author_name = author_name[0] # Use the first name in the list for now - if author_name: - tasks = [ - asyncio.create_task(get_openalex_author_reputation(author_name)), - asyncio.create_task(get_semantic_scholar_author_reputation(author_name)), - asyncio.create_task(get_google_scholar_author_reputation(author_name)) - ] - results = await asyncio.gather(*tasks) - error_count = 0 - for result in results: - if isinstance(result, Exception): - logger.exception(f"Error in author reputation fetch: {result}") - error_count += 1 - if error_count == 3: - raise Exception("Multiple errors in author reputation fetch") - if result: - if result.get("h_index", 0): - h_index_values.append(result["h_index"]) - if result.get("pub_count", 0): - pub_count_values.append(result["pub_count"]) - - # Combine the results: take the highest h-index and publication count values - combined_h_index = max(h_index_values) if h_index_values else 0 - combined_pub_count = max(pub_count_values) if pub_count_values else 0 - - # Compute scores with more generous scaling - h_index_score = min(100, combined_h_index * 10) # h-index of 10+ yields max score - 
pub_count_score = min(100, combined_pub_count * 1) # 100+ publications yields max score - - # Weighted average favoring h-index - final_reputation = (h_index_score * 0.7) + (pub_count_score * 0.3) - - # Ensure minimum score of 20 for any author with an ID or name - if author_id or author_name: - final_reputation = max(20, final_reputation) - - return final_reputation - -async def get_openalex_author_reputation(author_name: str): - """Fetch author reputation from OpenAlex using the authors endpoint.""" - await rate_limit() - try: - response = requests.get(f"{OPEN_ALEX_AUTHOR_API}?search={author_name}", timeout=DEFAULT_TIMEOUT) - if response.status_code == 200: - data = response.json() - if data.get("results"): - first_author = data["results"][0] - h_index = first_author.get("h_index", 0) - works_count = first_author.get("works_count", 0) - return {"h_index": h_index, "pub_count": works_count} - except Exception as e: - logger.exception(f"OpenAlex API error: {e}") - raise Exception(str(e)) from e - - return None - -async def get_semantic_scholar_author_reputation(author_name: str): - """Fetch author reputation from Semantic Scholar using the author search endpoint.""" - await rate_limit() - try: - params = {"query": author_name, "fields": "hIndex,paperCount", "limit": 1} - response = requests.get(SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API, params=params, timeout=DEFAULT_TIMEOUT) - if response.status_code == 200: - data = response.json() - if data.get("data") and len(data["data"]) > 0: - first_author = data["data"][0] - h_index = first_author.get("hIndex", 0) - paper_count = first_author.get("paperCount", 0) - return {"h_index": h_index, "pub_count": paper_count} - except Exception as e: - logger.exception(f"Semantic Scholar API error: {e}") - raise Exception(str(e)) from e - - return None - -async def get_google_scholar_author_reputation(author_name: str): - """Fetch author reputation from Google Scholar using the scholarly package.""" - try: - # Wrap the synchronous scholarly call in asyncio.to_thread for non-blocking execution - result = await asyncio.to_thread(lambda: next(scholarly.search_author(author_name), None)) - if result: - author_data = await asyncio.to_thread(lambda: scholarly.fill(result)) - h_index = author_data.get("hindex", 0) # scholarly returns 'hindex' in lowercase - pub_count = len(author_data.get("publications", [])) - return {"h_index": h_index, "pub_count": pub_count} - else: - return None - except Exception as e: - logger.exception(f"Google Scholar error: {e}") - raise Exception(str(e)) from e diff --git a/backend/metricsService/src/services/citation_data.py b/backend/metricsService/src/services/citation_data.py deleted file mode 100644 index a14b93d..0000000 --- a/backend/metricsService/src/services/citation_data.py +++ /dev/null @@ -1,92 +0,0 @@ -""" -Citation Data Service Module - -This module handles the retrieval and processing of citation data for academic -publications. It aggregates citation counts from multiple sources to provide -a more comprehensive assessment of a publication's impact. 
- -Key Functions: -- get_citation_data: Main function that fetches and processes citation data - -Data Sources: -- Crossref API -- OpenCitations API - -Scoring Methodology: -- Takes maximum citation count from available sources -- Applies logarithmic scaling to normalize scores -- Caps maximum score at 100 -- Returns 0 if no citations found - -Features: -- Retry mechanism for failed API calls -- Rate limiting to prevent API abuse -- Comprehensive error handling and logging -- Fallback mechanism when sources fail -""" - -import requests -from typing import Optional -from ..utils.api_config import CROSSREF_API, OPEN_CITATIONS_API -from ..utils.api_utils import rate_limit -from ..utils.api_utils import retry_on_failure -from ..utils.logging_config import get_logger -import asyncio - -logger = get_logger(__name__) - - -@retry_on_failure(max_retries=3, delay=1) -async def get_citation_data(doi: str) -> float: - """Fetch citation count from Crossref and OpenCitations concurrently.""" - if not doi: - return 0 - - async def fetch_crossref(): - try: - await rate_limit() - response = requests.get(f"{CROSSREF_API}/{doi}", timeout=10) - if response.status_code == 200: - data = response.json() - return data.get("message", {}).get("is-referenced-by-count", 0) - except Exception as e: - logger.warning(f"Crossref API error: {e}") - return 0 - - async def fetch_opencitations(): - try: - await rate_limit() - response = requests.get( - f"{OPEN_CITATIONS_API}citations/{doi}", - timeout=10 - ) - if response.status_code == 200: - data = response.json() - return len(data) - except Exception as e: - logger.warning(f"OpenCitations API error: {e}") - return 0 - - try: - # Run both API calls concurrently - crossref_count, opencitations_count = await asyncio.gather( - fetch_crossref(), - fetch_opencitations() - ) - - citation_count = max(crossref_count, opencitations_count) - if citation_count == 0: - return 0 - - # Optimized scoring calculation - if citation_count < 10: - return citation_count * 2 - elif citation_count < 100: - return min(100, 20 + citation_count) - elif citation_count < 1000: - return min(100, 50 + (citation_count // 10)) - else: - return 100 - except Exception as e: - logger.error(f"Error in get_citation_data: {e}") - raise diff --git a/backend/metricsService/src/services/credibility_service.py b/backend/metricsService/src/services/credibility_service.py deleted file mode 100644 index 6edc27b..0000000 --- a/backend/metricsService/src/services/credibility_service.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -Credibility Service Module - -This module contains the core business logic for calculating credibility scores -of academic sources. It combines multiple factors into a weighted score and -provides caching for improved performance. 
- -Key Functions: -- calculate_credibility: Main function that computes credibility score -- get_credibility_level: Converts numeric score to qualitative level - -Scoring Methodology: -The credibility score is calculated as a weighted sum of these factors: -- Domain Reputation (20%) -- Citation Count (20%) -- Journal Impact (25%) -- Recency (10%) -- Authorship Reputation (25%) - -Features: -- Asynchronous execution of scoring components -- Caching of results to improve performance -- Comprehensive error handling and logging -- Weighted scoring system -- Normalization of final score - -Implementation Details: -- Uses asyncio for concurrent processing of scoring components -- Implements caching using src.utils.cache -- Provides detailed logging through src.utils.logging_config -""" - -import asyncio -from typing import Dict, Any, Optional -from urllib.parse import urlparse -from src.models.schemas import CredibilityRequest -from src.services.domain_reputation import get_domain_reputation -from src.services.citation_data import get_citation_data -from src.services.journal_impact import get_journal_impact -from src.services.recency_score import calculate_recency_score -from src.services.author_reputation import get_authorship_reputation -from src.utils.cache import get_cache, set_cache -from src.utils.logging_config import get_logger - -logger = get_logger(__name__) - -async def calculate_credibility(request: CredibilityRequest) -> Dict[str, Any]: - """Calculate credibility score based on multiple factors""" - try: - # Check cache first - cache_key = f"credibility:{request.model_dump()}" - cached_result = await get_cache(cache_key) - if cached_result: - return cached_result - - # Extract and validate parameters - domain = request.domain or (lambda u: (not u.hostname.startswith("www.") and u.hostname) or u.hostname[4:])(urlparse(request.link)) - doi = request.citation_doi - issn = request.issn - journal = request.journal - publication_date = request.publication_date - author_id = request.author_id - author_name = request.author_name - - # Create and execute tasks concurrently - tasks = [ - ('domain', get_domain_reputation(domain), 0.20) if domain else None, - ('citation', get_citation_data(doi), 0.20) if doi else None, - ('journal', get_journal_impact(issn, journal), 0.25) if issn else None, - ('recency', calculate_recency_score(publication_date), 0.10), - ('author', get_authorship_reputation(author_id, author_name), 0.25) if (author_id or author_name) else None - ] - - # Filter out None tasks - valid_tasks = [task for task in tasks if task is not None] - results = {} - total_score = 0 - total_weight = 0 - - if valid_tasks: - async with asyncio.TaskGroup() as tg: - # Create all tasks concurrently - running_tasks = { - key: tg.create_task(coro) for key, coro, _ in valid_tasks - } - - # Process results - for key, _, weight in valid_tasks: - try: - score = await running_tasks[key] - if score is not None: - results[key] = score - total_score += score * weight - total_weight += weight - except Exception as e: - logger.warning(f"Error processing {key}: {str(e)}") - results[key] = 0 - - # Prepare result - result = { - "total_score": round(total_score, 2), - "domain_reputation": round(results.get('domain', 0),2), - "citation_count": results.get('citation', 0), - "journal_impact": results.get('journal', 0), - "recency": results.get('recency', 0), - "authorship_reputation": results.get('author', 0) - } - - # Debug logging - logger.info(f"Final result before caching: {result}") - logger.info(f"Total score 
calculation: {total_score} (total_weight: {total_weight})") - - # Cache the result - await set_cache(cache_key, result) - - return result - except Exception as e: - logger.error(f"Error calculating credibility: {str(e)}") - raise - -def get_credibility_level(score: float) -> str: - """Get credibility level based on score""" - if score > 85: - return "Very High" - elif score >= 75: - return "High" - elif score >= 60: - return "Moderate" - elif score >= 50: - return "Low" - else: - return "Very Low" diff --git a/backend/metricsService/src/services/domain_reputation.py b/backend/metricsService/src/services/domain_reputation.py deleted file mode 100644 index dd419f5..0000000 --- a/backend/metricsService/src/services/domain_reputation.py +++ /dev/null @@ -1,152 +0,0 @@ -""" -Domain Reputation Service Module - -This module handles the assessment of domain reputation by combining -popularity rankings with security assessments. It provides a unified -score that reflects both a domain's authority and security posture. - -Key Functions: -- get_domain_reputation: Main function that calculates domain reputation -- initialize_tranco_data: Fetches and processes TRANCO ranking data - -Data Sources: -- TRANCO List (domain popularity rankings) -- Mozilla Observatory (security assessments) - -Scoring Methodology: -- Combines TRANCO ranking score (70%) with security grade (30%) -- Uses logarithmic scaling for TRANCO rankings -- Maps security grades to numerical scores -- Ensures minimum score for all domains - -Features: -- Caching of TRANCO ranking data -- Asynchronous API calls for better performance -- Rate limiting to prevent API abuse -- Comprehensive error handling and logging -- Fallback mechanism when sources fail -""" - -import asyncio -import requests -from ..utils.api_config import ( - TRANCO_DOMAIN_API, - TRANCO_API, - OBSERVATORY_API -) -from ..utils.api_utils import rate_limit -from src.utils.logging_config import get_logger -import csv -from io import StringIO - -logger = get_logger(__name__) - -def initialize_tranco_data(): - """Fetch and parse TRANCO CSV data into a domain:rank dictionary""" - try: - # Get the latest TRANCO list metadata - response = requests.get(TRANCO_API) - if response.status_code != 200: - logger.error("Failed to fetch TRANCO list metadata") - return {} - - list_metadata = response.json() - if not list_metadata.get('download'): - logger.error("No download URL in TRANCO response") - return {} - - # Download and parse the CSV - csv_response = requests.get(list_metadata['download']) - if csv_response.status_code != 200: - logger.error("Failed to download TRANCO CSV") - return {} - - csv_data = csv_response.text - csv_reader = csv.reader(StringIO(csv_data)) - return {row[1]: int(row[0]) for row in csv_reader} - - except Exception as e: - logger.error(f"Error initializing TRANCO data: {e}") - return {} - -# Initialize TRANCO data at module level -tranco_data = initialize_tranco_data() - -async def get_domain_reputation(domain: str) -> float: - """Fetch domain authority score from Tranco List and Mozilla Observatory.""" - global tranco_data # Declare global at the start of the function - - if not domain: - return 0 - - try: - # Initialize rank - rank = 0 - - # Check if we have valid tranco_data - if tranco_data and isinstance(tranco_data, dict): - rank = tranco_data.get(domain, 0) - else: - # Fetch new data if tranco_data is invalid - try: - tranco_response = requests.get(f"{TRANCO_DOMAIN_API}/{domain}") - if tranco_response.status_code == 200: - tranco_data = 
tranco_response.json() - rank = tranco_data.get("ranks", [{"rank": 0, "date": ""}])[0].get("rank", 0) - except Exception as e: - logger.exception(f"Tranco API error: {e}") - raise Exception(str(e)) from e - - # Calculate Tranco score - print(f"Rank: {rank}") - if rank == 0: - tranco_score = 0 - elif rank <= 1000: - tranco_score = 80 + (20 * (1 - (rank / 1000))) - elif rank <= 10000: - tranco_score = 60 + (20 * (1 - ((rank - 1000) / 9000))) - elif rank <= 100000: - tranco_score = 40 + (20 * (1 - ((rank - 10000) / 90000))) - elif rank <= 1000000: - tranco_score = 20 + (20 * (1 - ((rank - 100000) / 900000))) - else: - tranco_score = 20 - - # Get Mozilla Observatory security score - await rate_limit() - try: - observatory_response = requests.post( - OBSERVATORY_API, - data={"host": domain, "hidden": "true"}, - timeout=10 - ) - if observatory_response.status_code == 200: - scan_data = observatory_response.json() - scan_id = scan_data.get("scan_id") - if scan_id: - for _ in range(3): - await asyncio.sleep(2) - await rate_limit() - results = requests.get(f"{OBSERVATORY_API}/{domain}", timeout=10).json() - if results.get("state") == "FINISHED": - grade = results.get("grade", "F") - grade_scores = {"A+": 100, "A": 95, "A-": 90, "B+": 85, "B": 80, "B-": 75, - "C+": 70, "C": 65, "C-": 60, "D+": 55, "D": 50, "D-": 45, "F": 40} - observatory_score = grade_scores.get(grade, 20) - break - else: - observatory_score = 0 - else: - observatory_score = 0 - else: - observatory_score = 0 - except Exception as e: - logger.exception(f"Observatory API error: {e}") - observatory_score = 50 - - # Combine scores (70% Tranco, 30% Observatory) - domain_score = (tranco_score * 0.7) + (observatory_score * 0.3) - return domain_score - except Exception as e: - logger.exception(f"Error in get_domain_reputation: {e}") - raise Exception(str(e)) from e diff --git a/backend/metricsService/src/services/journal_impact.py b/backend/metricsService/src/services/journal_impact.py deleted file mode 100644 index e7c3774..0000000 --- a/backend/metricsService/src/services/journal_impact.py +++ /dev/null @@ -1,136 +0,0 @@ -""" -Journal Impact Service Module - -This module handles the assessment of journal impact by combining -data from multiple sources. It provides a unified score that reflects -a journal's quality and reputation in the academic community. 
- -Key Functions: -- get_journal_impact: Main function that calculates journal impact score - -Data Sources: -- DOAJ (Directory of Open Access Journals) -- ISSN API (International Standard Serial Number database) - -Scoring Methodology: -- Primary source: DOAJ (60% weight) -- Fallback source: ISSN API (40% weight) -- Considers factors like peer review status, journal age, and registration -- Caps maximum score at 100 -- Returns 0 if no data available - -Features: -- Retry mechanism for failed API calls -- Rate limiting to prevent API abuse -- Comprehensive error handling and logging -- Fallback mechanism when primary source fails -""" - -from typing import Optional -from datetime import datetime -import requests -from issn.issn_api import get_by_issn, parse_item -from ..utils.api_config import DOAJ_API, DOAJ_API_WITH_ISSN -from ..utils.api_utils import rate_limit -from ..utils.api_utils import retry_on_failure -from ..utils.logging_config import get_logger -import asyncio - -logger = get_logger(__name__) - -@retry_on_failure(max_retries=3, delay=1) -async def get_journal_impact(issn: Optional[str] = None, journal: Optional[str] = None) -> float: - """Fetch journal impact factor using DOAJ as primary and ISSN API as fallback.""" - if not issn and not journal: - return 0 - - async def fetch_doaj(): - try: - await rate_limit() - url = DOAJ_API_WITH_ISSN.format(issn=issn) if issn else DOAJ_API.format(journal=journal) - response = requests.get(url, timeout=10) - if response.status_code == 200: - data = response.json() - if data.get("total", 0) > 0: - journal_data = data.get("results", [])[0] - score = 50 # Base score for being in DOAJ - - # Check peer review process - review_process = journal_data.get("bibjson", {}).get("editorial", {}).get("review_process", "") - if review_process and "peer review" in review_process: - score += 30 - - # Check journal age - created_date = journal_data.get("created_date", "") - if created_date and len(created_date) >= 4: - try: - journal_year = int(created_date[:4]) - if journal_year <= datetime.now().year - 10: - score += 20 - except ValueError: - pass - return score - else: - return 0 - except Exception as e: - logger.exception(f"DOAJ API error: {e}") - return None - - async def fetch_issn(): - if not issn: - return None - try: - await rate_limit() - issn_data = get_by_issn(issn) - if not issn_data: - return None - - issn_data = parse_item(issn_data) - if not issn_data.get('data'): - return None - - data = issn_data['data'] - score = 50 # Base score for valid ISSN - - # Additional scoring factors - if data.get('CountryCode'): - score += 5 - if data.get('resource', {}).get('URL'): - score += 5 - if data.get('KeyTitle',''): - score += 10 - if (record := data.get('Record', {})) and record.get('status','').lower() == 'register': - score += 10 - if data.get('ISSN', {}).get('status') == 'Valid': - score += 20 - else: - score -= 20 - - return score - except Exception as e: - logger.exception(f"ISSN API error: {e}") - return None - - try: - # Fetch both scores concurrently - doaj_score, issn_score = await asyncio.gather( - fetch_doaj(), - fetch_issn() - ) - - if doaj_score is None and issn_score is None: - raise Exception("Both DOAJ and ISSN API calls failed") - - # Calculate final score - if doaj_score and issn_score: - return min(100, (doaj_score * 0.6) + (issn_score * 0.4)) - elif doaj_score and doaj_score > 0: - return min(100, doaj_score) - elif issn_score and issn_score > 0: - return min(100, issn_score) - else: - return 0 - - except Exception as e: -
logger.exception(f"Error in get_journal_impact: {e}") - raise Exception(str(e)) from e diff --git a/backend/metricsService/src/services/recency_score.py b/backend/metricsService/src/services/recency_score.py deleted file mode 100644 index 6b1233a..0000000 --- a/backend/metricsService/src/services/recency_score.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -Recency Score Service Module - -This module handles the calculation of recency scores for academic publications. -Newer publications receive higher scores, reflecting their timeliness and relevance. - -Key Functions: -- calculate_recency_score: Main function that computes recency score - -Scoring Methodology: -- Scores based on years since publication: - - 0-1 years: 100 - - 1-2 years: 90 - - 2-3 years: 80 - - 3-5 years: 50 - - 5-7 years: 40 - - 7-10 years: 10 - - 10+ years: 0 -- Handles both string and integer date formats -- Extracts year from strings using regex - -Features: -- Comprehensive error handling and logging -- Default scores for invalid dates -- Flexible date parsing -""" - -import re -from datetime import datetime -from typing import Union -from ..utils.logging_config import get_logger -logger = get_logger(__name__) -async def calculate_recency_score(publication_date: Union[str, int]) -> float: - """Calculate recency score - newer publications get higher scores.""" - if not publication_date: - logger.info("No publication date provided") - return 0 - try: - current_year = datetime.now().year - year = None - if isinstance(publication_date, int): - year = publication_date - elif isinstance(publication_date, str): - year_match = re.search(r'(\d{4})', publication_date) - if year_match: - year = int(year_match.group(1)) - if not year or year > current_year: - raise ValueError("Invalid publication date") - years_ago = current_year - year - if years_ago <= 1: - return 100 - elif years_ago <= 2: - return 90 - elif years_ago <= 3: - return 80 - elif years_ago <= 5: - return 50 - elif years_ago <= 7: - return 40 - elif years_ago <= 10: - return 10 - else: - return 0 - except ValueError as e: - logger.exception(f"Invalid publication date: {e}") - return 0 - except Exception as e: - logger.exception(f"Error in calculate_recency_score: {e}") - return 50 diff --git a/backend/metricsService/src/utils/api_config.py b/backend/metricsService/src/utils/api_config.py deleted file mode 100644 index 8b6c9c6..0000000 --- a/backend/metricsService/src/utils/api_config.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -API Configuration Module - -This module contains all API endpoint configurations and constants used -throughout the application. It serves as a centralized location for managing -external service connections. 
- -Key Constants: -- CROSSREF_API: Crossref API endpoint for citation data -- TRANCO_API: Tranco List API for domain rankings -- DOAJ_API: Directory of Open Access Journals API -- ORCID_API: ORCID API for author information -- SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API: Semantic Scholar author search -- OPEN_ALEX_AUTHOR_API: OpenAlex author data -- OBSERVATORY_API: Mozilla Observatory security scans -- OPEN_CITATIONS_API: OpenCitations metadata - -Features: -- Centralized API configuration -- Easy maintenance of endpoint URLs -- Consistent naming conventions -""" - -# API endpoints and constants -CROSSREF_API = "https://api.crossref.org/works" -TRANCO_API = "https://tranco-list.eu/api/lists/date/latest" -TRANCO_DOMAIN_API = "https://tranco-list.eu/api/ranks/domain/" -DOAJ_API_WITH_ISSN = "https://doaj.org/api/v2/search/journals/issn:{issn}" -DOAJ_API = "https://doaj.org/api/v2/search/journals/{journal}" -ORCID_API = "https://pub.orcid.org/v3.0/" -SEMANTIC_SCHOLAR_AUTHOR_SEARCH_API = "https://api.semanticscholar.org/graph/v1/author/search" -OPEN_ALEX_AUTHOR_API = "https://api.openalex.org/authors" -OBSERVATORY_API = "https://http-observatory.security.mozilla.org/api/v1/analyze" -OPEN_CITATIONS_API = "https://opencitations.net/index/api/v1/" -MAX_CONCURRENT_WORKERS = 20 -DEFAULT_CONCURRENT_WORKERS = 10 -DEFAULT_TIMEOUT = 10 diff --git a/backend/metricsService/src/utils/api_utils.py b/backend/metricsService/src/utils/api_utils.py deleted file mode 100644 index 5df5984..0000000 --- a/backend/metricsService/src/utils/api_utils.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -API Utilities Module - -This module provides utility functions for handling API interactions, -including rate limiting and retry mechanisms. These utilities help -ensure reliable and efficient API communication throughout the application. - -Key Functions: -- rate_limit: Ensures API calls don't exceed rate limits -- retry_on_failure: Decorator for retrying failed API calls - -Features: -- Global rate limiting across all API calls -- Configurable retry mechanism with exponential backoff -- Asynchronous implementation for non-blocking operations -- Comprehensive logging for debugging and monitoring -""" - -import asyncio -import time -import functools -from src.utils.logging_config import get_logger -from typing import Callable, Any - -logger = get_logger(__name__) - -# Rate limiting parameters -API_RATE_LIMIT = 1 # Minimum seconds between API calls -last_api_call_time = 0 - -async def rate_limit(): - """Ensure we don't exceed API rate limits.""" - global last_api_call_time - current_time = time.time() - time_since_last_call = current_time - last_api_call_time - if time_since_last_call < API_RATE_LIMIT: - await asyncio.sleep(API_RATE_LIMIT - time_since_last_call) - last_api_call_time = time.time() - -def retry_on_failure(max_retries: int = 3, delay: float = 1): - """Decorator to retry a function on failure.""" - def decorator(func: Callable): - @functools.wraps(func) - async def wrapper(*args, **kwargs): - for attempt in range(max_retries): - try: - return await func(*args, **kwargs) - except Exception as e: - if attempt == max_retries - 1: - raise - logger.warning(f"Attempt {attempt + 1} failed: {str(e)}. 
Retrying in {delay} seconds...") - await asyncio.sleep(delay) - return wrapper - return decorator diff --git a/backend/metricsService/src/utils/cache.py b/backend/metricsService/src/utils/cache.py deleted file mode 100644 index 147a93f..0000000 --- a/backend/metricsService/src/utils/cache.py +++ /dev/null @@ -1,118 +0,0 @@ -""" -Cache Management Module - -This module provides caching functionality using Redis as the backend. -When Redis is unavailable, it falls back to an in-memory cache with TTL support. -It includes functions for getting, setting, and clearing cache, as well -as a decorator for caching function results. - -Key Functions: -- get_cache: Retrieves cached data -- set_cache: Stores data in cache -- clear_cache: Clears all cached data -- cache_decorator: Decorator for caching function results - -Caching Strategy: -- Uses Redis as the primary caching backend when available -- Falls back to in-memory cache with TTL support when Redis is unavailable -- Default TTL (Time To Live) of 1 hour -- Automatic serialization/deserialization of data -- Cache key generation based on function arguments - -Features: -- Asynchronous operations -- Comprehensive error handling and logging -- Configurable TTL for cached items -- Decorator for easy function caching -""" - -import json -import time -from typing import Any, Dict, Optional, Tuple -import hashlib -from src.utils.logging_config import get_logger -from src.utils.cache_config import redis_client - -logger = get_logger(__name__) -CACHE_TTL = 3600 # Cache time-to-live in seconds - -# In-memory cache to use when Redis is unavailable -# Structure: {key: (value, expiration_timestamp)} -memory_cache: Dict[str, Tuple[Any, float]] = {} - -def _generate_cache_key(*args, **kwargs) -> str: - """Generate a unique cache key based on function arguments""" - key_str = f"{args}{kwargs}" - return hashlib.md5(key_str.encode()).hexdigest() - -def _clean_expired_cache_entries() -> None: - """Remove expired entries from the in-memory cache""" - current_time = time.time() - expired_keys = [key for key, (_, expiry) in memory_cache.items() if current_time > expiry] - for key in expired_keys: - del memory_cache[key] - -async def get_cache(key: str) -> Optional[Any]: - """Get cached value by key""" - try: - # Try Redis first if available - if redis_client: - cached_data = await redis_client.get(key) - return json.loads(cached_data) if cached_data else None - - # Fall back to in-memory cache when Redis is not available - _clean_expired_cache_entries() # Clean expired entries - if key in memory_cache: - value, expiry = memory_cache[key] - if time.time() <= expiry: - return value - # Remove expired entry - del memory_cache[key] - return None - except Exception as e: - logger.error(f"Error getting cache for key {key}: {str(e)}") - return None - -async def set_cache(key: str, value: Any, ttl: int = CACHE_TTL) -> bool: - """Set cache value with optional TTL""" - try: - # Try Redis first if available - if redis_client: - serialized_value = json.dumps(value) - return await redis_client.set(key, serialized_value, ex=ttl) - - # Fall back to in-memory cache when Redis is not available - expiry_time = time.time() + ttl - memory_cache[key] = (value, expiry_time) - return True - except Exception as e: - logger.error(f"Error setting cache for key {key}: {str(e)}") - return False - -async def clear_cache() -> None: - """Clear all cached values""" - try: - # Clear Redis cache if available - if redis_client: - await redis_client.flushdb() - - # Clear in-memory cache - 
memory_cache.clear() - except Exception as e: - logger.error(f"Error clearing cache: {str(e)}") - -def cache_decorator(ttl: int = CACHE_TTL): - """Decorator for caching function results""" - def wrapper(func): - async def wrapped(*args, **kwargs): - cache_key = _generate_cache_key(func.__name__, *args, **kwargs) - cached_result = await get_cache(cache_key) - - if cached_result is not None: - return cached_result - - result = await func(*args, **kwargs) - await set_cache(cache_key, result, ttl) - return result - return wrapped - return wrapper \ No newline at end of file diff --git a/backend/metricsService/src/utils/cache_config.py b/backend/metricsService/src/utils/cache_config.py deleted file mode 100644 index c14016d..0000000 --- a/backend/metricsService/src/utils/cache_config.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Cache Configuration Module - -This module handles the configuration and management of the Redis cache -connection. It provides functions for initializing and closing the Redis -connection, which is used throughout the application for caching. - -Key Functions: -- init_redis: Initializes the Redis connection -- close_redis: Closes the Redis connection - -Configuration: -- REDIS_URL: Redis server connection URL -- redis_client: Global Redis client instance - -Features: -- Asynchronous connection management -- Comprehensive error handling and logging -- Type hints for better code clarity -""" - -from typing import Optional -import os -from redis import asyncio as aioredis -from src.utils.logging_config import get_logger - -logger = get_logger(__name__) - -# Redis connection settings from environment variables -redis_client: Optional[aioredis.Redis] = None - -async def init_redis(): - """Initialize Redis connection""" - global redis_client - try: - redis_client = aioredis.Redis( - host=os.getenv('REDIS_HOST', 'localhost'), - port=int(os.getenv('REDIS_PORT', '6379')), - db=int(os.getenv('REDIS_DB', '0')), - password=os.getenv('REDIS_PASSWORD'), - decode_responses=True - ) - except Exception as e: - logger.error(f"Failed to initialize Redis connection: {str(e)}") - raise - -async def close_redis(): - """Close Redis connection""" - if redis_client: - await redis_client.close() diff --git a/backend/metricsService/src/utils/logging_config.py b/backend/metricsService/src/utils/logging_config.py deleted file mode 100644 index a84baf4..0000000 --- a/backend/metricsService/src/utils/logging_config.py +++ /dev/null @@ -1,80 +0,0 @@ -""" -Logging Configuration Module - -This module handles the configuration of the application's logging system. -It sets up both file(optional) and stream handlers with a standardized format for -consistent logging throughout the application. - -Key Functions: -- get_logger: Returns a configured logger instance - -Configuration: -- Log level: INFO -- Log format: Timestamp - Logger Name - Level - Message -- Handlers: File handler - -Features: -- Centralized logging configuration -- Easy logger instance creation -- Both file and stream output -- Standardized log format -""" - -import os -import logging -from datetime import datetime -from typing import Optional -from logging import Logger - -logger = None # Global logger instance - -def setup_logging( - log_level=logging.INFO, - log_dir: str = 'logs', - filename: Optional[str] = 'log', - logToFile: Optional[bool] = False, - ) -> Logger: - - """ - Set up a standardized logging configuration for the entire project. 
- - Args: - log_level (int): Logging level (default: logging.INFO) - log_dir (str): Directory to store log files (default: 'logs') - filename (str): Base filename for log files (default: 'log') - logToFile (bool): Whether to log to file (default: False) - """ - - global logger - # Create a unique log filename with timestamp - timestamp = datetime.now().strftime("%Y%m%U") - - # Configure logging - logging.basicConfig( - level=log_level, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - handlers=[ - logging.StreamHandler() # Also log to console - ] - ) - logger = logging.getLogger(filename) - - if logToFile: - # Ensure logs directory exists - os.makedirs(log_dir, exist_ok=True) - log_filename = os.path.join(log_dir, f'{filename}_{timestamp}.log') - logger.addHandler(logging.FileHandler(log_filename)) - - -def get_logger(filename:str) -> Logger: - """ - Get the configured logger instance. - - Returns: - Logger: Configured logger instance - """ - global logger - if logger is None: - setup_logging(filename=filename) - return logger - diff --git a/backend/metricsService/tests/api/test_endpoints.py b/backend/metricsService/tests/api/test_endpoints.py deleted file mode 100644 index 4530bac..0000000 --- a/backend/metricsService/tests/api/test_endpoints.py +++ /dev/null @@ -1,142 +0,0 @@ -import pytest -from fastapi.testclient import TestClient -from main import app -from unittest.mock import AsyncMock, patch - - -def test_root_endpoint(test_client): - """Test the root endpoint""" - response = test_client.get("/") - assert response.status_code == 200 - assert "message" in response.json() - -@pytest.mark.asyncio -async def test_credibility_endpoint_default_response(test_client): - test_data = { - "domain": "example.com", - "citation_doi": "10.1234/example", - "journal": "Example Journal", - "publication_date": "2023-01-01", - "author_id": "0000-0001-2345-6789", - "title": "Test Title", - "type": "article" - } - - expected_response = { - "status": "success", - "data": { - "credibility_score": 85.0, - "url": "example.com", - "title": "Test Title", - "type": "article" - } - } - - with patch('src.api.endpoints.calculate_credibility', new_callable=AsyncMock) as mock_calculate: - mock_calculate.return_value = { - "total_score": 85.0, - "credibility_level": "High", - "domain_reputation": { - "score": 80.0, - "weighted_score": 16.0, - "weight": 0.2, - "available": True - } - } - - response = test_client.post("/api/v1/credibility", json=test_data) - assert response.status_code == 200 - assert response.json() == expected_response - - - -def test_credibility_endpoint_detailed_response(test_client): - """Test the credibility endpoint with detailed response""" - test_data = { - "domain": "example.com", - "citation_doi": "10.1234/example", - "journal": "Example Journal", - "publication_date": "2023-01-01", - "author_id": "0000-0001-2345-6789", - "title": "Test Title", - "type": "article" - } - - with patch('src.api.endpoints.calculate_credibility', new_callable=AsyncMock) as mock_calculate: - - mock_calculate.return_value = { - "total_score": 85.0, - "credibility_level": "High", - "domain_reputation": { - "score": 80.0, - "weighted_score": 16.0, - "weight": 0.2, - "available": True - }, - "citation_count": { - "score": 75.0, - "weighted_score": 15.0, - "weight": 0.2, - "available": True - }, - "journal_impact": { - "score": 90.0, - "weighted_score": 22.5, - "weight": 0.25, - "available": True - }, - "recency": { - "score": 100.0, - "weighted_score": 10.0, - "weight": 0.1, - "available": True - }, 
- "authorship_reputation": { - "score": 80.0, - "weighted_score": 20.0, - "weight": 0.25, - "available": True - } - } - - response = test_client.post("http://127.0.0.1:9050/api/v1/credibility?detailed=true", json=test_data) - assert response.status_code == 200 - response_data = response.json() - assert "status" in response_data - assert response_data["status"] == "success" - assert "data" in response_data - assert "credibility_score" in response_data["data"] - assert "domain_reputation" in response_data["data"]["component"] - assert "citation_count" in response_data["data"]["component"] - assert "journal_impact" in response_data["data"]["component"] - assert "recency" in response_data["data"]["component"] - assert "authorship_reputation" in response_data["data"]["component"] - -def test_credibility_endpoint_with_none_score(test_client): - test_data = { - "domain": "example.com", - "citation_doi": "10.1234/example", - "journal": "Example Journal", - "publication_date": "2023-01-01", - "author_id": "0000-0001-2345-6789" - } - - with patch('src.api.endpoints.calculate_credibility', new_callable=AsyncMock) as mock_calculate: - mock_calculate.return_value = { - "total_score": 60.0, - "credibility_level": "Moderate", - "domain_reputation": { - "score": None, - "weighted_score": 0.0, - "weight": 0.2, - "available": True - } - } - - response = test_client.post("/api/v1/credibility", json=test_data) - assert response.status_code == 200 - response_json = response.json() - assert "data" in response_json - assert "credibility_score" in response_json["data"] # Note lowercase - assert response_json["data"]["credibility_score"] == 60.0 - diff --git a/backend/metricsService/tests/conftest.py b/backend/metricsService/tests/conftest.py deleted file mode 100644 index 5721bbb..0000000 --- a/backend/metricsService/tests/conftest.py +++ /dev/null @@ -1,20 +0,0 @@ -import pytest -from fastapi.testclient import TestClient -from main import app - -@pytest.fixture -def test_client(): - """Fixture to provide a test client for FastAPI""" - return TestClient(app) - -@pytest.fixture -def mock_credibility_request(): - """Fixture to provide a sample credibility request""" - return { - "domain": "example.com", - "citation_doi": "10.1234/example", - "journal": "Example Journal", - "publication_date": "2023-01-01", - "author_id": "0000-0001-2345-6789", - "author_name": "John Doe" - } diff --git a/backend/metricsService/tests/models/test_schemas.py b/backend/metricsService/tests/models/test_schemas.py deleted file mode 100644 index 7d745a6..0000000 --- a/backend/metricsService/tests/models/test_schemas.py +++ /dev/null @@ -1,85 +0,0 @@ -from src.models.schemas import ComponentScore, CredibilityRequest, CredibilityResponse - - -def test_credibility_request_validation(): - """Test validation of CredibilityRequest model""" - - # Test valid request - valid_data = { - "domain": "example.com", - "citation_doi": "10.1234/example", - "journal": "Example Journal", - "publication_date": "2023-01-01", - "author_id": "0000-0001-2345-6789", - "author_name": "John Doe" - } - request = CredibilityRequest(**valid_data) - assert request.domain == "example.com" - assert request.citation_doi == "10.1234/example" - - # Test minimum required fields - minimal_data = { - "publication_date": "2023-01-01" - } - request = CredibilityRequest(**minimal_data) - assert request.publication_date == "2023-01-01" - assert request.domain is None - -def test_component_score_validation(): - """Test validation of ComponentScore model""" - - # Test valid 
component score - valid_data = { - "score": 80.0, - "weighted_score": 16.0, - "weight": 0.2, - "available": True - } - component = ComponentScore(**valid_data) - assert component.score == 80.0 - assert component.weighted_score == 16.0 - assert component.weight == 0.2 - assert component.available is True - - # Test with default values - minimal_data = { - "score": 50.0, - "weighted_score": 25.0, - "weight": 0.5 - } - component = ComponentScore(**minimal_data) - assert component.available is False - -def test_credibility_response_validation(): - """Test validation of CredibilityResponse model""" - - # Test valid response - valid_data = { - "status": "success", - "data": { - "total_score": 85.0, - "components": { - "domain_reputation": { - "score": 80.0, - "weighted_score": 16.0, - "weight": 0.2, - "available": True - } - } - } - } - response = CredibilityResponse(**valid_data) - assert response.status == "success" - assert response.data["total_score"] == 85.0 - assert response.data["components"]["domain_reputation"]["score"] == 80.0 - - # Test with minimal data - minimal_data = { - "data": { - "total_score": 50.0, - "components": {} - } - } - response = CredibilityResponse(**minimal_data) - assert response.status == "success" # Default value - assert response.data["total_score"] == 50.0 diff --git a/backend/metricsService/tests/services/test_credibility_service.py b/backend/metricsService/tests/services/test_credibility_service.py deleted file mode 100644 index 550a4fd..0000000 --- a/backend/metricsService/tests/services/test_credibility_service.py +++ /dev/null @@ -1,310 +0,0 @@ -import pytest -from unittest.mock import AsyncMock, patch -from src.services.credibility_service import ( - calculate_credibility, - get_credibility_level, - get_domain_reputation, - get_citation_data, - get_journal_impact, - calculate_recency_score, - get_authorship_reputation -) -from src.models.schemas import CredibilityRequest - -@pytest.mark.asyncio -async def test_calculate_credibility_basic(): - """Test basic credibility calculation""" - with patch('src.services.credibility_service.get_domain_reputation', new_callable=AsyncMock) as mock_domain, \ - patch('src.services.credibility_service.get_citation_data', new_callable=AsyncMock) as mock_citation, \ - patch('src.services.credibility_service.get_journal_impact', new_callable=AsyncMock) as mock_journal, \ - patch('src.services.credibility_service.calculate_recency_score', new_callable=AsyncMock) as mock_recency, \ - patch('src.services.credibility_service.get_authorship_reputation', new_callable=AsyncMock) as mock_author: - - mock_domain.return_value = 80.0 - mock_citation.return_value = 70.0 - mock_journal.return_value = 90.0 - mock_recency.return_value = 50.0 - mock_author.return_value = 80.0 - - request = CredibilityRequest( - domain="example.com", - citation_doi="10.1234/example", - journal="Example Journal", - publication_date="2023-01-01", - author_id="0000-0001-2345-6789" - ) - - result = await calculate_credibility(request) - - assert result["authorship_reputation"]== 80.0 - -@pytest.mark.parametrize("score,expected_level", [ - (95, "Very High"), - (80, "High"), - (65, "Moderate"), - (50, "Low"), - (30, "Very Low"), - (100, "Very High"), - (0, "Very Low") -]) -def test_get_credibility_level(score, expected_level): - """Test credibility level mapping""" - assert get_credibility_level(score) == expected_level - -@pytest.mark.asyncio -async def test_get_domain_reputation(): - """Test domain reputation calculation""" - with 
patch('src.services.domain_reputation.requests.get') as mock_get, \ - patch('src.services.domain_reputation.rate_limit', new_callable=AsyncMock): - - # Mock Tranco API response - mock_get.return_value.json.return_value = {"example.com": 500} - - # Mock Observatory API responses - mock_get.side_effect = [ - # First call - start scan - type('MockResponse', (), {'status_code': 200, 'json': lambda: {"scan_id": "123"}}), - # Second call - get results - type('MockResponse', (), {'status_code': 200, 'json': lambda: {"state": "FINISHED", "grade": "A"}}) - ] - - score = await get_domain_reputation("example.com") - assert 0 <= score <= 100 - -@pytest.mark.asyncio -async def test_get_domain_reputation_invalid_domain(): - """Test domain reputation with invalid domain""" - with patch('src.services.domain_reputation.requests.get') as mock_get, \ - patch('src.services.domain_reputation.rate_limit', new_callable=AsyncMock): - - # Mock Tranco API response - mock_get.return_value.json.return_value = {} - - score = await get_domain_reputation("invalid-domain") - assert score == 0 - -@pytest.mark.asyncio -async def test_get_domain_reputation_api_error(): - """Test domain reputation when APIs fail""" - with patch('src.services.domain_reputation.requests.get') as mock_get, \ - patch('src.services.domain_reputation.rate_limit', new_callable=AsyncMock),\ - patch('src.services.domain_reputation.tranco_data') : - - # Mock API failure - mock_get.side_effect = Exception("API error") - - with pytest.raises(Exception, match="API error"): - await get_domain_reputation("example.com") - - - - - -@pytest.mark.asyncio -async def test_get_citation_data(): - """Test citation data retrieval""" - with patch('src.services.citation_data.requests.get') as mock_get, \ - patch('src.services.citation_data.rate_limit', new_callable=AsyncMock): - - # Mock Crossref API response - mock_get.return_value.json.return_value = {"message": {"is-referenced-by-count": 10}} - - score = await get_citation_data("10.1234/example") - assert 0 <= score <= 100 - -@pytest.mark.asyncio -async def test_get_citation_data_no_doi(): - """Test citation data with no DOI""" - score = await get_citation_data(None) - assert score == 0 - -@pytest.mark.asyncio -async def test_get_citation_data_api_error(): - """Test citation data when APIs fail""" - with patch('src.services.citation_data.requests.get') as mock_get, \ - patch('src.services.citation_data.rate_limit', new_callable=AsyncMock): - - # Mock API failure - mock_get.side_effect = Exception("API error") - - score = await get_citation_data("10.1234/example") - assert score == 0 - -@pytest.mark.asyncio -async def test_get_journal_impact_with_issn(): - """Test journal impact calculation with ISSN""" - with patch('src.services.journal_impact.requests.get') as mock_get, \ - patch('src.services.journal_impact.get_by_issn') as mock_issn, \ - patch('src.services.journal_impact.parse_item') as mock_parse, \ - patch('src.services.journal_impact.rate_limit', new_callable=AsyncMock) as mock_rate_limit: - - # Mock DOAJ API response - mock_rate_limit.return_value = None - - mock_get.return_value.json.return_value = {"total": 1, "results": [{"bibjson": {"title": "Example Journal"}}]} - mock_issn.return_value = """{ - 'data': { - 'Country': 'Switzerland', - 'CountryCode': 'sz', - 'KeyTitle': 'Frontiers in public health', - 'ISSN': {'status': 'Valid', 'value': '2296-2565'}, - 'Organization': '40', - 'Record': {'modified': '20210207002300.0', 'status': 'Register'}, - 'errors': [], - 'issn': '2296-2565', } - 'resource': {'URL': 
'https://www.frontiersin.org/journals/public-health#articles'}, - } - } - """ - mock_parse.return_value = { - 'data': { - 'Country': 'Switzerland', - 'CountryCode': 'sz', - 'ISSN': {'status': 'Valid', 'value': '2296-2565'}, - 'KeyTitle': 'Frontiers in public health', - 'Organization': '40', - 'Record': {'modified': '20210207002300.0', 'status': 'Register'}, - 'errors': [], - 'issn': '2296-2565', } , - 'resource': {'URL': 'https://www.frontiersin.org/journals/public-health#articles'}, - } - score = await get_journal_impact(issn="1234-5678") - assert 0 <= score <= 100 - assert mock_rate_limit.called - -@pytest.mark.asyncio -async def test_get_journal_impact_with_journal_name(): - """Test journal impact calculation with journal name""" - with patch('src.services.journal_impact.requests.get') as mock_get, \ - patch('src.services.journal_impact.rate_limit', new_callable=AsyncMock): - - # Mock DOAJ API response - mock_get.return_value.status_code = 200 - mock_get.return_value.json.return_value = {"total": 1, "results": [{"bibjson": {"title": "Example Journal"}}]} - score = await get_journal_impact(journal="Example Journal") - assert 0 <= score <= 100 - -@pytest.mark.asyncio -async def test_get_journal_impact_with_both(): - """Test journal impact calculation with both ISSN and journal name""" - with patch('src.services.journal_impact.requests.get') as mock_get, \ - patch('src.services.journal_impact.get_by_issn') as mock_issn, \ - patch('src.services.journal_impact.parse_item') as mock_parse, \ - patch('src.services.journal_impact.rate_limit', new_callable=AsyncMock) as mock_rate_limit: - - # Mock DOAJ API response - mock_rate_limit.return_value = None - - mock_get.return_value.json.return_value = {"total": 1, "results": [{"bibjson": {"title": "Example Journal"}}]} - mock_issn.return_value = """{ - 'data': { - 'Country': 'Switzerland', - 'CountryCode': 'sz', - 'KeyTitle': 'Frontiers in public health', - 'ISSN': {'status': 'Valid', 'value': '2296-2565'}, - 'Organization': '40', - 'Record': {'modified': '20210207002300.0', 'status': 'Register'}, - 'errors': [], - 'issn': '2296-2565', } - 'resource': {'URL': 'https://www.frontiersin.org/journals/public-health#articles'}, - } - """ - mock_parse.return_value = { - 'data': { - 'Country': 'Switzerland', - 'CountryCode': 'sz', - 'ISSN': {'status': 'Valid', 'value': '2296-2565'}, - 'KeyTitle': 'Frontiers in public health', - 'Organization': '40', - 'Record': {'modified': '20210207002300.0', 'status': 'Register'}, - 'errors': [], - 'issn': '2296-2565', } , - 'resource': {'URL': 'https://www.frontiersin.org/journals/public-health#articles'}, - } - score = await get_journal_impact(issn="1234-5678", journal="Example Journal") - assert 0 <= score <= 100 - assert mock_rate_limit.called - -@pytest.mark.asyncio -async def test_get_journal_impact_no_issn_or_journal(): - """Test journal impact with no ISSN or journal name""" - score = await get_journal_impact() - assert score == 0 - -@pytest.mark.asyncio -async def test_get_journal_impact_invalid_journal_name(): - """Test journal impact with invalid journal name""" - with patch('src.services.journal_impact.requests.get') as mock_get, \ - patch('src.services.journal_impact.rate_limit', new_callable=AsyncMock): - - # Mock DOAJ API response - mock_get.return_value.status_code = 200 - mock_get.return_value.json.return_value = {"total": 0} - - score = await get_journal_impact(journal="Invalid Journal") - assert score == 0 - -# @pytest.mark.skip(reason="DOAJ API is not working") -@pytest.mark.asyncio -async def 
test_get_journal_impact_api_error(): - """Test journal impact when APIs fail""" - with patch('src.services.journal_impact.requests.get') as mock_get, \ - patch('src.services.journal_impact.rate_limit', new_callable=AsyncMock) as mock_issn,\ - patch('src.services.journal_impact.get_by_issn', new_callable=AsyncMock): - - # Mock API failure - mock_get.side_effect = Exception("API error") - mock_issn.side_effect = Exception("API error") - - with pytest.raises(Exception ): - await get_journal_impact("example.com") - - - -@pytest.mark.asyncio -async def test_calculate_recency_score(): - """Test recency score calculation""" - # Test various date formats - assert await calculate_recency_score("2023-01-01") > 0 - assert await calculate_recency_score(2023) > 0 - assert await calculate_recency_score("January 2023") > 0 - -@pytest.mark.asyncio -async def test_calculate_recency_score_invalid_date(): - """Test recency score with invalid date""" - assert await calculate_recency_score("invalid-date") == 0 - -@pytest.mark.asyncio -async def test_calculate_recency_score_future_date(): - """Test recency score with future date""" - assert await calculate_recency_score("2030-01-01") == 0 - -@pytest.mark.asyncio -async def test_get_authorship_reputation(): - """Test author reputation calculation""" - with patch('src.services.author_reputation.requests.get') as mock_get, \ - patch('src.services.author_reputation.rate_limit', new_callable=AsyncMock): - - # Mock ORCID API response - mock_get.return_value.json.return_value = {"group": [{}]} - - score = await get_authorship_reputation(author_id="0000-0001-2345-6789") - assert 0 <= score <= 100 - -@pytest.mark.asyncio -async def test_get_authorship_reputation_no_author(): - """Test author reputation with no author info""" - score = await get_authorship_reputation() - assert score == 0 - -@pytest.mark.asyncio -async def test_get_authorship_reputation_api_error(): - """Test author reputation when APIs fail""" - with patch('src.services.author_reputation.requests.get') as mock_get, \ - patch('src.services.author_reputation.rate_limit', new_callable=AsyncMock): - - # Mock API failure - mock_get.side_effect = Exception("API error") - - score = await get_authorship_reputation(author_id="0000-0001-2345-6789") - assert score >= 20 # Minimum score for having author info diff --git a/backend/metricsService/tests/utils/test_api_utils.py b/backend/metricsService/tests/utils/test_api_utils.py deleted file mode 100644 index 33369bd..0000000 --- a/backend/metricsService/tests/utils/test_api_utils.py +++ /dev/null @@ -1,47 +0,0 @@ -import pytest -import time -from unittest.mock import AsyncMock -from src.utils.api_utils import rate_limit, retry_on_failure - -@pytest.mark.asyncio -async def test_rate_limit(): - """Test rate limiting functionality""" - # First call should proceed immediately - start_time = time.time() - await rate_limit() - first_call_duration = time.time() - start_time - assert first_call_duration < 0.1 # Should be almost instant - - # Second call should be delayed - start_time = time.time() - await rate_limit() - second_call_duration = time.time() - start_time - assert second_call_duration >= 1.0 # Should be delayed by at least 1 second - -@pytest.mark.asyncio -async def test_retry_on_failure_success(): - """Test retry_on_failure decorator with successful operation""" - mock_func = AsyncMock(return_value="success") - - @retry_on_failure(max_retries=3, delay=0.1) - async def test_func(): - return await mock_func() - - result = await test_func() - - assert result == 
"success" - mock_func.assert_called_once() # Should succeed on first try - -@pytest.mark.asyncio -async def test_retry_on_failure_failure(): - """Test retry_on_failure decorator with failing operation""" - mock_func = AsyncMock(side_effect=[Exception("Failed"), Exception("Failed"), "success"]) - - @retry_on_failure(max_retries=3, delay=0.1) - async def test_func(): - return await mock_func() - - result = await test_func() - - assert result == "success" - assert mock_func.call_count == 3 # Should succeed on third try diff --git a/backend/metricsService/tests/utils/test_cache.py b/backend/metricsService/tests/utils/test_cache.py deleted file mode 100644 index 25ffb90..0000000 --- a/backend/metricsService/tests/utils/test_cache.py +++ /dev/null @@ -1,79 +0,0 @@ -import pytest -from unittest.mock import AsyncMock, patch -from src.utils.cache import get_cache, set_cache - -@pytest.mark.asyncio -async def test_get_cache_miss(): - """Test get_cache when cache miss occurs""" - # Test with a non-existent key - result = await get_cache("non_existent_key") - assert result is None - -@pytest.mark.asyncio -async def test_set_get_cache(): - """Test setting and getting cache values""" - # Test data - test_key = "test_key" - test_value = {"data": "test_value"} - - # Set the cache value - await set_cache(test_key, test_value) - - # Get the cached value - result = await get_cache(test_key) - - # Verify the result - assert result is not None - assert result == test_value - assert isinstance(result, dict) - assert result["data"] == "test_value" - -@pytest.mark.asyncio -async def test_set_get_cache_with_expiry(): - """Test setting and getting cache values with expiration""" - test_key = "test_key_expiry" - test_value = {"data": "test_value"} - expiry = 60 # 60 seconds - - # Set the cache value with expiry - await set_cache(test_key, test_value, expiry) - - # Get the cached value - result = await get_cache(test_key) - - # Verify the result - assert result is not None - assert result == test_value - -@pytest.mark.asyncio -async def test_set_cache_invalid_value(): - """Test setting cache with invalid value""" - test_key = "test_key_invalid" - test_value = None - - # Set the cache value - await set_cache(test_key, test_value) - - # Get the cached value - result = await get_cache(test_key) - - # Verify the result - assert result is None - -@pytest.mark.asyncio -async def test_set_get_cache_multiple_values(): - """Test setting and getting multiple cache values""" - test_data = [ - ("key1", {"data": "value1"}), - ("key2", {"data": "value2"}), - ("key3", {"data": "value3"}) - ] - - # Set multiple cache values - for key, value in test_data: - await set_cache(key, value) - - # Get and verify each cached value - for key, expected_value in test_data: - result = await get_cache(key) - assert result == expected_value diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index fd369d7..0000000 --- a/docker-compose.yml +++ /dev/null @@ -1,36 +0,0 @@ -version: '3.8' - -services: - main_service: - build: - context: ./backend/mainService - dockerfile: Dockerfile - ports: - - "9020:8000" - env_file: - - ./backend/mainService/.env - environment: - - CREDIBILITY_API_URL=http://metrics_service:8000/api/v1/credibility/batch - volumes: - - ./backend/mainService:/app - networks: - - cite_me - depends_on: - - metrics_service - - metrics_service: - build: - context: ./backend/metricsService - dockerfile: Dockerfile - ports: - - "9050:8000" - env_file: - - ./backend/metricsService/.env - volumes: - - 
./backend/metricsService:/app - networks: - - cite_me - -networks: - cite_me: - driver: bridge \ No newline at end of file diff --git a/frontend/vercel.json b/frontend/vercel.json new file mode 100644 index 0000000..e6aa184 --- /dev/null +++ b/frontend/vercel.json @@ -0,0 +1,6 @@ +{ + "routes": [ + { "src": "/(.*)", "dest": "/index.html" } + ] + } + \ No newline at end of file From 26d33a64218eefeb38823395019f3cb57382aeb6 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 16:35:53 -0400 Subject: [PATCH 02/11] updated vercel.json with rewrites --- frontend/vercel.json | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/frontend/vercel.json b/frontend/vercel.json index e6aa184..8b66688 100644 --- a/frontend/vercel.json +++ b/frontend/vercel.json @@ -1,6 +1,7 @@ { - "routes": [ - { "src": "/(.*)", "dest": "/index.html" } + "rewrites": [ + { "source": "/editor", "destination": "/index.html" }, + { "source": "/preview", "destination": "/index.html" } ] } \ No newline at end of file From 70e538c1f195d2fd4d99f6d1bd47411ef49ee8f3 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 18:05:57 -0400 Subject: [PATCH 03/11] increased document title length to 150 --- frontend/src/components/MainPageHeader.vue | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/components/MainPageHeader.vue b/frontend/src/components/MainPageHeader.vue index f2d985c..776a958 100644 --- a/frontend/src/components/MainPageHeader.vue +++ b/frontend/src/components/MainPageHeader.vue @@ -85,7 +85,7 @@ const toggleView = () => { type="text" placeholder="Untitled" required - maxlength="50" + maxlength="150" /> From ee1ab02dc6c3c1499cef0335f32e28ca74fad1ba Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 18:19:14 -0400 Subject: [PATCH 04/11] update gitattributes to select ccurrent backend on merge conflict --- .gitattributes | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitattributes b/.gitattributes index 38cd5be..98f436d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,4 @@ .gitignore merge=ours README.md merge=ours docker-compose.yml merge=ours +backend merge=ours From 4bb35753e108b4d52f11128e9a658ff50ec2cbbc Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 18:22:54 -0400 Subject: [PATCH 05/11] update gitattributes to select ccurrent backend on merge conflict --- .gitattributes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 98f436d..40e3f02 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,4 @@ .gitignore merge=ours README.md merge=ours docker-compose.yml merge=ours -backend merge=ours +backend/ merge=ours From 4be42beb57c42b9c2a18b65e47142be40077a4f9 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 18:24:32 -0400 Subject: [PATCH 06/11] update gitattributes to select the current backend on merge conflict --- .gitattributes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 40e3f02..0626369 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,4 +1,4 @@ .gitignore merge=ours README.md merge=ours docker-compose.yml merge=ours -backend/ merge=ours +backend/** merge=ours From 0c5efe71d80dccf5f21aed6534b3c5867aa64809 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 18:33:18 -0400 Subject: [PATCH 07/11] Merged vercel-dev while keeping main's backend files --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore 
b/.gitignore index cff803a..7233e8f 100644 --- a/.gitignore +++ b/.gitignore @@ -61,8 +61,6 @@ testing_workflow.py local.settings.json playwright_browser __pycache__ -backend/ -.github/ docker-compose.yml scripts/ From a8a45693a27f0928afeca5975332c559eb4f5d9e Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 19:26:55 -0400 Subject: [PATCH 08/11] delete readme in metric service --- backend/metricsService/README.md | Bin 5238 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 backend/metricsService/README.md diff --git a/backend/metricsService/README.md b/backend/metricsService/README.md deleted file mode 100644 index 04ae6d996f6f9f3ae4204e3fa4d97089c446b3d4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5238 zcmchb*=`&~6o%^>iFau1o5W)5*p3jg#6>cWNx&At9uSMr?6xQIEX?BALc9bI4E*0Y zT~pmN9xuT{qweXhI?I2SI(4f5`FlNNp&EK&8k)L#VHgHBG7jC)(R)kRh0R}U{6wSU zaH;pUo<0qq*!4m)2jZVd!i(@Wya^TY@s5nl{u5UgPNik6@wR9#bf4+&+6Upx#&6a> z5iOiuNzC+gs(aIAy#rAo*RALqF87S%oSDq2P=n0l2JVP+W*Qvxskq65 zru6m1L5!j8hMl>FOf%rY=F1$-NdMykf`NG>+IDQ1z=#b*8h^OPXeIVcC4)Te>lv-g z^@OLdE6S=$t84EXF03bFNCyL`$WG_(m_{_g&*gc!eB)7Y znH94L9KQ>5&k*6GfiSX0rr}B#))nRRzMESGFY5-onucK0_%DTo%<-(t!UsKJw_BaG zQ3ue+%D}?|ixK)WjWspa)4e4R#8vs+G#?8uITw9NUU?>yS+Qc>$2T+6?G*%x6X|SA z2lsK8Ydwo2Wi8K` zD3cXQ1c&k~HIH=)Ui?N4#&^rovhYOXU|_MNG#Xcb&I@mHFY{^T+ab{xdS__{S_^1KJ9lCCkdg68&+u@lau&o-q z8=lMZvdprqcfx>22hw^mm+-2DA`9OvP!-XP#Lr4415+Hp#9MbRb4RE^RMG!-elKHb zN)p*a#jQ(TXU_XfJpABY{dAi&?4!m-A7RnCd9Bzb zK5Ch-PM1aw{8`A2xYo&b&-h{|@c(Cc-FmgDVtKB0w2bbxY$H>M6Z;!`c{{w%E&5t( z@*%4Z?kH_h+hWb|IzZRSzV5korqu>UI+G)fky7N9|PR?G*?H(*Hq}r-q~4r(*M4^8?o1 z*Jd;?YWHff$_F#+!r9ZW%zH5>9-{)5dsknZzgk4S^T-PIGLC#~-Aw$CS3i@?|5`_r zq{phKN#bMH)m2g-QeTsdkFB#s33u1qNbPl(x8E$sZmPp%_4nF`=%YA0u)?S|>>cc8 zxY&$g?-Y%?zW8EQ%Zo*eMJVmK~exnvm zatVvwB&V2x_1$Np@apItWn8UQje_SPJ$vj8i+yLT!pmK{cirgou7{_p`mkf|3HAI- zKs|h`x1YmT@~!tBbXrtSdM@wO>Z-ThysjrnNQ*mURg~1ch~;kUYqn2iIf_g^qIYH| zxc!8h`ebxfEDQREm)u~dDB@^Y#t^v_>rK%`yzD9Ho$BY6K|Jf-;#j~|4A5_C@i%Cs;^`W9g*I2{|I`X|DoI~B|?buOc*N+X3vtvHIDZh8a z7uKmgo$t>L=}5Yd^!$@zexmO~zX*+^U=P1C?VLUqPa*yVBzoOFL!ZVe$anR!@3D(s zn4jZ_S;T7NB+2&x|87u|6uS7dvTCxOb89?t#`)w?&(7kxd!!5i50c6&A!**%Jl*|J zwDHu0kV72;Yp_k`%wgVcjC%Z?OhI>Y7Je*xvwFNFXA From 72c244d3193c5e4935bc09940b49ca7ae095627b Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 19:37:01 -0400 Subject: [PATCH 09/11] update docker files. Update Readme --- README.md | 62 ++++++++++++++++++++++++------- backend/mainService/Dockerfile | 15 ++++---- backend/metricsService/Dockerfile | 18 +++------ 3 files changed, 62 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 38ac21e..d5e0726 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,11 @@ # CiteMe - Automatic Citation Generation System -CiteMe is a modern, full-stack application designed to help students generate references and in-line citations and references efficiently. The system provides intelligent citation suggestions, reference management, and seamless integration with academic databases. +CiteMe is a modern, full-stack application designed to help students generate references and in-line citations efficiently. The system provides intelligent citation suggestions, reference management, and seamless integration with academic databases. -Students do not have to worry about searching for sources to back essays and thesis. 
This web app will search the web , format your document with intext citation and include the references, sources and metrics to grade the credibility of the sources. +Students do not have to worry about searching for sources to back essays and thesis. This web app will search the web, format your document with intext citation and include the references, sources and metrics to grade the credibility of the sources. The webapp also offers the choice of providing your own sources, in forms of urls, texts and pdfs and is able to use these sources to format your essays/thesis with intext citation and references in any citation format. - 🌐 **Live Demo**: [CiteMe Editor](https://cite-me-wpre.vercel.app/editor) ## 🚀 Features @@ -17,6 +16,10 @@ The webapp also offers the choice of providing your own sources, in forms of url - **Real-time Metrics**: Track citation impact and academic metrics - **Modern UI**: Responsive and intuitive user interface - **API Integration**: Seamless integration with academic databases and search engines +- **Web Scraping**: Intelligent web scraping with Playwright for source extraction +- **Vector Search**: Efficient document retrieval using Pinecone vector database +- **AI-Powered**: Integration with multiple AI models (Azure, Groq, Gemini) for citation generation +- **Credibility Scoring**: Automated source credibility assessment ## 📁 Project Structure @@ -29,10 +32,13 @@ CiteMe/ │ └── dist/ # Production build ├── backend/ │ ├── mainService/ # Core citation service -│ └── metricsService/ # Analytics and metrics service -├── .github/ # GitHub workflows and templates -├── docker-compose.yml # Docker services configuration -└── README.md # Project documentation +│ │ ├── src/ # Source code +│ │ ├── scripts/ # Utility scripts +│ │ └── config/ # Configuration files +│ └── metricsService/ # Analytics and metrics service +├── .github/ # GitHub workflows and templates +├── docker-compose.yml # Docker services configuration +└── README.md # Project documentation ``` ## 🏗️ Architecture @@ -41,7 +47,14 @@ The application is built using a microservices architecture with three main comp 1. **Frontend Service**: Vue.js 3 application hosted on Vercel 2. **Main Service**: FastAPI-based backend service handling core citation functionality + - Web scraping with Playwright + - Vector search with Pinecone + - AI model integration (Azure, Groq, Gemini) + - Citation generation and formatting 3. **Metrics Service**: FastAPI-based service for handling academic metrics and analytics + - Source credibility assessment + - Citation impact analysis + - Academic metrics tracking ## 🛠️ Tech Stack @@ -56,12 +69,34 @@ The application is built using a microservices architecture with three main comp ### Backend - Python 3.11 - FastAPI -- Pinecone -- Gemini +- Pinecone (Vector Database) +- Gemini (Google AI) +- Groq - Azure hosted LLMs +- Mixbread (Reranking) - LangChain +- Playwright (Web Scraping) - Various AI/ML libraries +## 🔑 Environment Setup + +Before running the services, you'll need to set up the following API keys: + +1. Google API Keys: + - `CX`: Google Programmable Search Engine ID + - `GPSE_API_KEY`: Google Programmable Search Engine API key + - `GOOGLE_API_KEY`: Gemini API key + +2. AI Service Keys: + - `GROQ_API_KEY`: Groq API key + - `PINECONE_API_KEY`: Pinecone vector database + - `MIXBREAD_API_KEY`: Mixbread reranking service + - `AZURE_MODELS_ENDPOINT`: Azure endpoint for citation generation + +3. 
Optional Services: + - `CREDIBILITY_API_URL`: URL for the credibility metrics service + - `SERVERLESS`: Set to TRUE for serverless mode + ## 🚀 Getting Started ### Prerequisites @@ -78,9 +113,10 @@ git clone https://github.com/yourusername/citeme.git cd citeme ``` -2. Create `.env` files in both service directories: - - `backend/mainService/.env` - - `backend/metricsService/.env` +2. Create a `.env` file in the root directory with all required API keys: +```bash +cp backend/mainService/.env.example .env +``` 3. Build and run the services using Docker Compose: ```bash @@ -178,7 +214,7 @@ pytest The backend services have their own Dockerfiles: -- `backend/mainService/Dockerfile`: Python-based main service +- `backend/mainService/Dockerfile`: Python-based main service with Playwright support - `backend/metricsService/Dockerfile`: Python-based metrics service ## 🤝 Contributing diff --git a/backend/mainService/Dockerfile b/backend/mainService/Dockerfile index 084364c..db7cd05 100644 --- a/backend/mainService/Dockerfile +++ b/backend/mainService/Dockerfile @@ -2,12 +2,13 @@ FROM python:3.11-slim WORKDIR /app -# Install system dependencies +# Install system dependencies including Playwright requirements # Installs essential tools for compiling software from source, often needed for Python package dependencies.(build-essential) # Removes the package lists downloaded during the update to reduce the image size. RUN apt-get update && apt-get install -y \ build-essential \ cron \ + wget \ && rm -rf /var/lib/apt/lists/* # Set the PATH environment variable to include /app @@ -19,18 +20,18 @@ COPY requirements.txt . # Install Python dependencies RUN pip install --no-cache-dir -r requirements.txt +# Install Playwright and its dependencies +RUN playwright install && playwright install-deps + +# Create necessary directories +RUN mkdir -p /app/config /tmp/downloads + # Copy the source code COPY ./scripts/ /app/scripts/ COPY ./src/ /app/src/ COPY ./app.py /app/app.py COPY ./__init__.py /app/__init__.py -# Create a directory for runtime configuration -RUN mkdir -p /app/config - -# Install playwright -RUN playwright install && playwright install-deps - # Expose the port the app runs on EXPOSE 8000 diff --git a/backend/metricsService/Dockerfile b/backend/metricsService/Dockerfile index 5cf1198..a79f36e 100644 --- a/backend/metricsService/Dockerfile +++ b/backend/metricsService/Dockerfile @@ -3,28 +3,20 @@ FROM python:3.11-slim WORKDIR /app # Install system dependencies -# Installs essential tools for compiling software from source, often needed for Python package dependencies.(build-essential) -# Removes the package lists downloaded during the update to reduce the image size. RUN apt-get update && apt-get install -y \ build-essential \ && rm -rf /var/lib/apt/lists/* -# Set the PATH environment variable to include /app -ENV PATH="/app:${PATH}" - -# Copy requirements first to leverage Docker cache +# Copy requirements first COPY requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt -# Install Python dependencies -RUN pip install --no-cache-dir -r requirements.txt - -# Copy the rest of the application +# Copy the application COPY ./src/ /app/src/ -RUN cd /app/src +# Create necessary directories +RUN mkdir -p /app/config -# Expose the port the app runs on EXPOSE 8000 -# Command to run the application CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file From 022cf9230d6bc30c880f6bba4ee2de89a7773ce5 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 19:43:35 -0400 Subject: [PATCH 10/11] includes the workflow file --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index d5e0726..3ec168a 100644 --- a/README.md +++ b/README.md @@ -210,6 +210,18 @@ cd ../metricsService pytest ``` +## 🔄 CI/CD Pipeline + +The project uses GitHub Actions for continuous integration and deployment: + +- **Automated Testing**: Runs on every push to main and pull requests +- **Python 3.11**: Uses the latest Python 3.11 environment +- **Test Dependencies**: Installs both main and test requirements +- **PR Management**: Automatically closes failed PRs with explanatory comments +- **Environment Variables**: Securely manages API keys and configuration + +The pipeline can be found in `.github/workflows/python-ci-cd.yml`. + ## 📦 Docker Images The backend services have their own Dockerfiles: From f123c84f58a7ada7a0710c98be642a3400beb346 Mon Sep 17 00:00:00 2001 From: Ikeoluwa Oladele Date: Sat, 29 Mar 2025 19:44:25 -0400 Subject: [PATCH 11/11] includes the workflow file --- .github/workflows/python-ci-cd.yml | 74 ++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 .github/workflows/python-ci-cd.yml diff --git a/.github/workflows/python-ci-cd.yml b/.github/workflows/python-ci-cd.yml new file mode 100644 index 0000000..1038a47 --- /dev/null +++ b/.github/workflows/python-ci-cd.yml @@ -0,0 +1,74 @@ +name: Python CI/CD + +env: + GPSE_API_KEY: ArzaSdB1_DhfPRKfJMdY6dp8duWdQTKQdC2xxkwc + GROQ_API_KEY: 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 + CX: 3afe68fe44e8d4620 + MIXBREAD_API_KEY: emb_f838d0f14ue7d61907d7f28fd643s8eaf49c0da2wf32f22a + PINECONE_API_KEY: pcsk_7LufHa_aUYWm5r5WwF1LBhfujiKftHWLX9iU6fyYxtkDukMnZZQKMWQJcXrFmhzt7GtVtJ + AZURE_API_KEY: 1JCm7aFbY2zVyXndOwAaljohGFAeFKjvwmDLa200gjSdlsLOqP3yJQQJ99BBACREanaXJ3w3AbgAACOG2ZyA + GOOGLE_API_KEY: Adzac4B4-q3u3Q_lssqr_dc7k-WM28ygszsVrIe + CREDIBILITY_API_URL: https://credibility-api.example.com + SERVERLESS: FALSE + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +# Add permissions configuration +permissions: + pull-requests: write + issues: write + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies for mainService + run: | + cd backend/mainService + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install -r requirements-test.txt + + - name: Install dependencies for metricService + run: | + cd backend/metricsService + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install -r requirements-test.txt + + - name: Run tests + run: | + cd backend/mainService + python -m pytest + cd ../metricsService + python -m pytest + + - name: Close failed PR + if: failure() && github.event_name == 'pull_request' + uses: 
actions/github-script@v6 + with: + script: | + await github.rest.pulls.update({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: context.issue.number, + state: 'closed' + }); + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: 'This PR was automatically closed because the CI pipeline failed.' + }); \ No newline at end of file
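A note on the `env` block in the workflow above: the README's CI/CD section describes the pipeline as securely managing API keys, and in GitHub Actions that is typically done by storing the values as repository secrets and referencing them through the `secrets` context rather than committing literal values. The sketch below is a hypothetical alternative `env` block, not part of the patch series itself; the variable names are taken from the workflow above, while the corresponding secret names are assumptions.

```yaml
# Hypothetical env block for .github/workflows/python-ci-cd.yml, reading each
# value from a repository secret instead of a literal committed to the repo.
# The secret names are assumed to mirror the variable names used in the patch.
env:
  GPSE_API_KEY: ${{ secrets.GPSE_API_KEY }}
  GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
  CX: ${{ secrets.CX }}
  MIXBREAD_API_KEY: ${{ secrets.MIXBREAD_API_KEY }}
  PINECONE_API_KEY: ${{ secrets.PINECONE_API_KEY }}
  AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
  GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
  CREDIBILITY_API_URL: ${{ secrets.CREDIBILITY_API_URL }}
  SERVERLESS: FALSE
```

With the secrets defined under the repository's Settings → Secrets and variables → Actions, the rest of the workflow (dependency installation, the pytest runs for both services, and the close-failed-PR step) would read the same environment variable names unchanged.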