From c6b70103924f8cad2aa267314aa50b562b00376b Mon Sep 17 00:00:00 2001
From: KaisHasan
Date: Wed, 8 Oct 2025 13:59:07 +0300
Subject: [PATCH 1/7] add python cache folders to the git ignore list

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 53efcb5..da683f9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .apm
 Build/
+__pycache__

From ccae913357e439f6b1d91b3e89ab39328710a6f7 Mon Sep 17 00:00:00 2001
From: KaisHasan
Date: Wed, 8 Oct 2025 13:59:39 +0300
Subject: [PATCH 2/7] write the mcp server code in flask

---
 mcp/__init__.py |   0
 mcp/app.py      | 114 ++++++++++++++++++++++++++++++++++++++++++++++++
 mcp/tools.json  |  17 +++++++
 3 files changed, 131 insertions(+)
 create mode 100644 mcp/__init__.py
 create mode 100644 mcp/app.py
 create mode 100644 mcp/tools.json

diff --git a/mcp/__init__.py b/mcp/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mcp/app.py b/mcp/app.py
new file mode 100644
index 0000000..6644e92
--- /dev/null
+++ b/mcp/app.py
@@ -0,0 +1,114 @@
+import json
+
+from flask import Flask, request, jsonify
+
+from copilot_server.agents import BasicExamplesRetriever
+
+app = Flask(__name__)
+
+global_id = 0
+BASIC_VDB_PATH = 'vdb_basic.index'
+BASIC_EXAMPLES_INDEX_PATH = 'index.json'
+
+basic_examples_retriever = BasicExamplesRetriever(BASIC_VDB_PATH, BASIC_EXAMPLES_INDEX_PATH, debug=True)
+
+
+with open("mcp/tools.json", "r", encoding="utf-8") as tools_file:
+    TOOLS = json.load(tools_file)
+
+
+def initialize_mcp(id):
+    return jsonify({
+        "jsonrpc": "2.0",
+        "id": id,
+        "result": {
+            "protocolVersion": "2025-06-18",
+            "serverInfo": {"name": "alusus-docs", "version": "1.0.0"},
+            "capabilities": {"tools": {"listChanged": True}, "resources": {}}
+        }
+    })
+
+
+def retrieve_docs(query):
+    docs = basic_examples_retriever.get_docs(query, k=10)
+
+    return docs
+
+
+def get_tool(tool_name):
+    if tool_name == 'alusus_docs_retrieve':
+        return retrieve_docs
+    else:
+        return None
+
+
+@app.route('/mcp', methods=['POST'])
+def mcp():
+    payload = request.get_json(force=True)
+    print(f'payload: {payload}')
+
+    jsonrpc = payload.get("jsonrpc")
+    method = payload.get("method")
+    id_ = payload.get("id")
+
+    if method == 'initialize':
+        return initialize_mcp(id_)
+
+    if method == 'notifications/initialized':
+        return jsonify({
+            "jsonrpc": "2.0",
+            "method": "notifications/initialized",
+            "params": {}
+        }), 200
+
+    if method == 'tools/list':
+        return jsonify({
+            "jsonrpc": "2.0",
+            "id": id_,
+            "result": {"tools": TOOLS}
+        })
+
+    if method == 'ping':
+        return jsonify({
+            "jsonrpc": "2.0",
+            "id": id_,
+            "result": {}
+        })
+
+    if method == 'tools/call':
+        params = payload.get("params")
+        if params is not None:
+            tool_name = params.get("name")
+            args = params.get("arguments")
+
+            tool = get_tool(tool_name)
+
+            if tool is None:
+                return ("Bad Request, invalid tool name", 400)
+
+            results = tool(**args)
+
+            content = [{"type": "text", "text": item} for item in results]
+
+            return jsonify({
+                "jsonrpc": "2.0",
+                "id": id_,
+                "result": {
+                    "content": content
+                }
+            })
+        else:
+            return ("Bad Request, include required parameters to invoke the tool", 400)
+
+
+@app.route('/query', methods=['POST'])
+def handle_query():
+    data = request.json
+
+    print(f'data:\n {data}')
+
+    return jsonify({"response": "message reached the server!!"})
+
+
+if __name__ == '__main__':
+    app.run(debug=True)
diff --git a/mcp/tools.json b/mcp/tools.json
new file mode 100644
index 0000000..53a6859
--- /dev/null
+++ b/mcp/tools.json
@@ -0,0 +1,17 @@
+[
+  {
+    "name": "alusus_docs_retrieve",
"alusus_docs_retrieve", + "title": "Alusus Docs", + "description": "Retrieve documentation for a given query about Alusus programming language", + "inputSchema": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The user question about Alusus programming language" + } + }, + "required": ["query"] + } + } +] From 13dbd8d8f67f581e19fd4365636294746a6b6bb9 Mon Sep 17 00:00:00 2001 From: KaisHasan Date: Wed, 8 Oct 2025 14:00:38 +0300 Subject: [PATCH 3/7] configure opencode to use the mcp test server --- opencode/config.json | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 opencode/config.json diff --git a/opencode/config.json b/opencode/config.json new file mode 100644 index 0000000..53172f5 --- /dev/null +++ b/opencode/config.json @@ -0,0 +1,39 @@ +{ + "$schema": "https://opencode.ai/config.json", + "agent": { + "alusus-server": { + "mode": "subagent", + "description": "Posts user input to my HTTP endpoint and returns the JSON response", + "model": "openai/o4-mini", + "prompt": "Take the user’s question, POST `{ \"input\": \"\" }` to http://127.0.0.1:5000/query , then return the parsed JSON." + }, + + "alusus-simple": { + "mode": "subagent", + "description": "Clean the user query and retrieve docs from the correct mcp", + "prompt": "When the user asks about the Alusus programming language, Clean the user’s question by removing filler words, trailing punctuation, and convert it to a concise form. Then take it and, POST `{ \"query\": \"\" }` to http://127.0.0.1:5000/retrieve-docs. Once the tool returns, compose a final answer grounded in the returned 'docs' field.", + "model": "openai/o4-mini" + }, + + "alusus-docs": { + "mode": "subagent", + "description": "Posts user input to my mcp endpoint and returns the JSON response", + "model": "openai/o4-mini", + "prompt": "Use the mcp \"alusus-docs\" to retrieve docs through the tool `alusus_docs_retrieve`, the mcp use http, so do the initialization and all steps required for that." 
+    }
+  },
+
+
+  "mcp": {
+    "alusus-docs": {
+      "type": "remote",
+      "url": "http://127.0.0.1:5000/mcp",
+      "enabled": true
+    }
+  },
+
+  "tools": {
+    // Enable all tools from the docs server
+    "alusus-docs_*": true
+  }
+}

From 13946cbbe499957b8e3dd6f94332e079e93f5c7e Mon Sep 17 00:00:00 2001
From: KaisHasan
Date: Wed, 14 Jan 2026 14:22:07 +0300
Subject: [PATCH 4/7] implement a scraper for the website and github repos of Alusus

---
 data/scraper.py | 248 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 248 insertions(+)
 create mode 100644 data/scraper.py

diff --git a/data/scraper.py b/data/scraper.py
new file mode 100644
index 0000000..87da1cf
--- /dev/null
+++ b/data/scraper.py
@@ -0,0 +1,248 @@
+import re
+import json
+import requests
+from tqdm import tqdm
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from bs4.element import Tag
+from bs4 import BeautifulSoup
+
+
+with open("./config.json", encoding="utf-8") as f:
+    cfg = json.load(f)
+
+
+def download_json(url: str) -> dict:
+    if not url:
+        return None
+
+    response = requests.get(url)
+    response.raise_for_status()
+    return response.json()
+
+
+def fetch_text(url: str) -> str:
+    try:
+        r = requests.get(url)
+        if r.status_code == 200:
+            return r.text
+    except Exception:
+        pass
+    return None
+
+
+def get_default_branch(repo_url: str) -> str:
+    # Extract owner and repo from URL
+    match = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
+    if not match:
+        raise ValueError("Invalid GitHub repository URL")
+
+    owner, repo = match.group(1), match.group(2)
+
+    api_url = f"https://api.github.com/repos/{owner}/{repo}"
+
+    headers = {"Accept": "application/vnd.github+json"}
+    token = cfg.get("GITHUB_TOKEN")
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+
+    response = requests.get(api_url, headers=headers)
+
+    if response.status_code != 200:
+        raise RuntimeError(f"GitHub API error: {response.status_code} - {response.text}")
+
+    data = response.json()
+    return data.get("default_branch", "main")  # Fallback to `main`
+
+
+def path_to_raw_url(path: str, repo_url: str) -> str:
+    default_branch = get_default_branch(repo_url)
+    view_url = repo_url.rstrip("/") + f"/blob/{default_branch}/" + path.lstrip("/")
+    raw_url = view_url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")
+
+    return raw_url
+
+
+def get_readme_url(repo_url: str) -> str:
+    return path_to_raw_url("/readme.md", repo_url)
+
+
+def extract_subpage_links(md_text: str) -> list[tuple[str, str]]:
+    pattern = r"\[\[([^\]]+)\]\]\(([^)]+)\)"
+    matches = re.findall(pattern, md_text)
+    return matches
+
+
+def process_library(item: dict) -> dict:
+    lib_name = item.get("name")
+    repo_url = item.get("url")
+
+    lib_meta = {
+        "name": lib_name,
+        "repo_url": repo_url,
+        "description": "",
+        "content": None,
+        "subpages": {}
+    }
+
+    if not repo_url:
+        return lib_meta
+
+    # Find README
+    readme_url = get_readme_url(repo_url)
+    if not readme_url:
+        return lib_meta
+
+    readme_text = fetch_text(readme_url)
+    lib_meta["content"] = readme_text
+
+    if not readme_text:
+        return lib_meta
+
+    # Extract subpages
+    subpages = extract_subpage_links(readme_text)
+
+    for link_text, sub_url in subpages:
+        if sub_url == "readme.ar.md":
+            continue
+
+        # Convert relative links to absolute
+        if not sub_url.startswith("http"):
+            sub_url = path_to_raw_url(sub_url, repo_url)
+
+        content = fetch_text(sub_url)
+        if content:
+            lib_meta["subpages"][link_text] = {
+                "url": sub_url,
+                "description": "",
+                "content": content,
+            }
+
+    return lib_meta
+
+
+def is_inside(el, container):
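+    """Return True if `el` is `container` itself or is nested anywhere inside it."""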
+    parent = el
+    while parent:
+        if parent is container:
+            return True
+        parent = parent.parent
+    return False
+
+
+def process_webpage_doc(url: str) -> dict:
+    html = fetch_text(url)
+    soup = BeautifulSoup(html, "html.parser")
+
+    sidebar = soup.select_one(".col-md-3 .card.side-sticky .card-content")
+    if not sidebar:
+        raise ValueError("Sidebar not found in the HTML page")
+
+    # Extract ordered list of (id, title) from sidebar
+    nav_items = []
+    for a in sidebar.select("a[href^='#']"):
+        href = a.get("href")
+        if href and href.startswith("#"):
+            section_id = href[1:]  # remove '#'
+            title = a.get_text(strip=True)
+            nav_items.append((section_id, title))
+
+    if not nav_items:
+        return {}
+
+    content_container = soup.select_one(".col-md-9") or soup.body
+
+    sections = {}
+    for i, (section_id, title) in enumerate(nav_items):
+        start_el = soup.find(id=section_id)
+        if not start_el:
+            continue
+
+        # Determine the next section ID (if any)
+        next_id = nav_items[i + 1][0] if i + 1 < len(nav_items) else None
+
+        content_parts = []
+        started = False
+
+        # Walk the DOM in document order, including text nodes
+        for node in start_el.next_elements:
+            # Skip the starting element itself
+            if not started:
+                started = True
+                continue
+
+            # If we left the content container, stop
+            if isinstance(node, Tag) and not is_inside(node, content_container):
+                break
+
+            # Stop when we reach the next section anchor
+            if isinstance(node, Tag) and next_id and node.get("id") == next_id:
+                break
+
+            # if this node contains the next node, skip
+            if isinstance(node, Tag) and next_id:
+                inner_next = node.find(id=next_id)
+                if inner_next is not None:
+                    continue
+
+            content_parts.append(str(node))
+
+        sections[title] = {
+            "content": "".join(content_parts),
+            "description": "",  # to be filled later using AI models
+        }
+
+    return sections
+
+
+def parallel_process_libraries(max_workers=5) -> dict:
+    print("\nDownloading libraries info...")
+    libraries = download_json(cfg.get("LIBRARIES_META_URL"))
+    print("\nDone Downloading libraries info...")
+
+    print("\nExtracting libraries meta information...")
+    def process(lib_info):
+        lib_name = lib_info.get("name")
+        data = process_library(lib_info)
+        return lib_name, data
+
+    libs_meta = {}
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [executor.submit(process, lib_info) for lib_info in libraries]
+
+        for future in tqdm(as_completed(futures), total=len(futures)):
+            lib_name, data = future.result()
+            libs_meta[lib_name] = data  # safe: done in main thread only
+
+    return libs_meta
+
+
+def process_basic() -> dict:
+    basic_meta = {}
+
+    print("\nExtracting website docs...")
+    for doc_url in tqdm(cfg.get("WEBSITE_DOCS_URLS")):
+        doc_sections = process_webpage_doc(doc_url)
+        basic_meta.update(doc_sections)
+
+    return basic_meta
+
+
+def save_json(d: dict, output_path: str):
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(d, f, indent=2, ensure_ascii=False)
+
+    print(f'Saved output to {output_path}')
+
+
+def main():
+    libs_meta = parallel_process_libraries()
+    save_json(libs_meta, cfg.get("LIBRARIES_OUTPUT_PATH"))
+
+    basic_meta = process_basic()
+    save_json(basic_meta, cfg.get("BASIC_OUTPUT_PATH"))
+
+
+if __name__ == "__main__":
+    main()

From c8cb2a3c3c4b1b17a446028cb145a26eb2df6dae Mon Sep 17 00:00:00 2001
From: KaisHasan
Date: Wed, 14 Jan 2026 14:27:21 +0300
Subject: [PATCH 5/7] add description to scraped data via AI models

---
 data/extract_func_cls_prompt.txt |  33 +++++++
 data/generate_description.py     | 147 +++++++++++++++++++++++++++++++++++++++
 data/summary_gen_prompt.txt      |  11 +++
 3 files changed, 191 insertions(+)
 create mode 100644 data/extract_func_cls_prompt.txt
 create mode 100644 data/generate_description.py
 create mode 100644 data/summary_gen_prompt.txt

diff --git a/data/extract_func_cls_prompt.txt b/data/extract_func_cls_prompt.txt
new file mode 100644
index 0000000..821e00c
--- /dev/null
+++ b/data/extract_func_cls_prompt.txt
@@ -0,0 +1,33 @@
+You are analyzing documentation for a software library. Your task is to extract all notable
+functions and classes described in the content.
+
+A "notable" function or class is one that:
+- is explicitly documented, OR
+- appears in code blocks, OR
+- is described as part of the library’s API.
+
+For each item, extract:
+- "type": either "function" or "class"
+- "description": a single concise sentence describing what it does
+- "declaration": the function signature or class declaration (with constructors) exactly as shown in the docs
+
+Rules:
+- Output ONLY valid JSON.
+- Do not add markdown such as ```json```.
+- Do not invent APIs that are not mentioned.
+- If the declaration is incomplete in the docs, extract the best available form.
+- Ignore examples that do not define new functions/classes.
+- For class declarations, the declarations of the various constructors should be presented, not just the class name.
+
+Content:
+{{CONTENT}}
+
+Return JSON in the following format:
+
+[
+  {
+    "type": "function",
+    "description": "...",
+    "declaration": "..."
+  }
+]
diff --git a/data/generate_description.py b/data/generate_description.py
new file mode 100644
index 0000000..0346098
--- /dev/null
+++ b/data/generate_description.py
@@ -0,0 +1,147 @@
+import json
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pprint import pprint
+
+from openai import OpenAI
+from tqdm import tqdm
+
+
+with open("./config.json", encoding="utf-8") as f:
+    cfg = json.load(f)
+
+with open(cfg.get("SUMMARY_PROMPT_PATH"), encoding="utf-8") as f:
+    summary_prompt = f.read()
+
+with open(cfg.get("FUNC_CLS_EXCTRACTION_PROMPT_PATH"), encoding="utf-8") as f:
+    func_cls_extract_prompt = f.read()
+
+client = OpenAI(api_key=cfg.get("OPENAI_API_KEY"))
+
+
+def parallel_summarize(meta_dict, max_workers=5):
+    items = list(meta_dict.values())
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = {executor.submit(summarize_hierarchy, item): item for item in items}
+
+        for _ in tqdm(as_completed(futures), total=len(futures)):
+            pass  # progress bar only
+
+    return meta_dict
+
+
+def summarize_text(text: str) -> str:
+    prompt = summary_prompt.replace("{{CONTENT}}", text)
+
+    response = client.chat.completions.create(
+        model=cfg.get("OPENAI_SUMMARISATION_MODEL"),
+        messages=[{"role": "user", "content": prompt}]
+    )
+
+    return response.choices[0].message.content.strip()
+
+
+def summarize_hierarchy(node):
+    """
+    node can be:
+    - a leaf: library without subpages or a subpage
+    - a library with subpages
+    """
+
+    # Leaf node
+    # either subpages key does not exist or its value is an empty dict
+    if not node.get("subpages"):
+        node["description"] = summarize_text(node["content"])
+        return node["description"]
+
+    # Non-leaf: summarize subpages first
+    subpages_summaries = []
+    for subpage_title, subpage_meta in node.get("subpages").items():
+        summary = summarize_hierarchy(subpage_meta)
+        subpages_summaries.append(f"{subpage_title}: {summary}")
+
+    combined = "\n".join(subpages_summaries)
+    node["description"] = summarize_text(combined)
+
+    return node["description"]
node["description"] + + +def save_json(d: dict, output_path: str): + with open(output_path, "w", encoding="utf-8") as f: + json.dump(d, f, indent=2, ensure_ascii=False) + + print(f'Saved output to {output_path}') + + +def build_library_content(library_meta): + parts = [] + + # README content + if "content" in library_meta: + parts.append(library_meta["content"]) + + # Subpages + if "subpages" in library_meta: + for sub in library_meta["subpages"].values(): + if "content" in sub: + parts.append(sub["content"]) + + return "\n\n".join(parts) + + +def extract_api_items(library_meta: dict): + full_content = build_library_content(library_meta) + + prompt = func_cls_extract_prompt.replace("{{CONTENT}}", full_content) + + response = client.chat.completions.create( + model=cfg.get("OPENAI_FUNC_CLS_EXTRACTION_MODEL"), + messages=[{"role": "user", "content": prompt}] + ) + + text = response.choices[0].message.content.strip() + + # Parse JSON safely + try: + data = json.loads(text) + library_meta["notable_functions_and_classes"] = data + except Exception: + print("Model returned invalid JSON, raw output:") + print(text) + library_meta["notable_functions_and_classes"] = [] + + +def parallel_func_cls_extraction(meta_dict, max_workers=5): + items = list(meta_dict.values()) + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(extract_api_items, item): item for item in items} + + for _ in tqdm(as_completed(futures), total=len(futures)): + pass # progress bar only + + return meta_dict + + +def main(): + print('\nReading basic meta file...') + with open(cfg.get("BASIC_OUTPUT_PATH"), encoding="utf-8") as f: + basic_meta = json.load(f) + + print('\nGenerating summaries...') + parallel_summarize(basic_meta) + save_json(basic_meta, cfg.get("BASIC_OUTPUT_PATH")) + + print('\nReading libraries meta file...') + with open(cfg.get("LIBRARIES_OUTPUT_PATH"), encoding="utf-8") as f: + libs_meta = json.load(f) + + print('\nGenerating summaries...') + parallel_summarize(libs_meta) + print('\nExtract notable functions and classes...') + parallel_func_cls_extraction(libs_meta) + save_json(libs_meta, cfg.get("LIBRARIES_OUTPUT_PATH")) + + +if __name__ == "__main__": + main() diff --git a/data/summary_gen_prompt.txt b/data/summary_gen_prompt.txt new file mode 100644 index 0000000..b7267b1 --- /dev/null +++ b/data/summary_gen_prompt.txt @@ -0,0 +1,11 @@ +You are summarizing documentation content to create a compact, embedding-friendly description. + +Your goal: +- Extract the core concepts. +- Produce a concise, information-dense description suitable for embeddings and AI agents. +- Do NOT rewrite the content; abstract it in a 1-3 sentences, max 50 words. +- Focus on what the component *is* and *does*. 
+- If the content is empty, then return the word 'EMPTY' only
+
+Content to summarize:
+{{CONTENT}}

From 014bf3d386cd5784edc8bfc6c77460e40664bba7 Mon Sep 17 00:00:00 2001
From: KaisHasan
Date: Wed, 14 Jan 2026 14:28:02 +0300
Subject: [PATCH 6/7] create vector databases from scraped information

---
 data/create_vdbs.py | 255 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 255 insertions(+)
 create mode 100644 data/create_vdbs.py

diff --git a/data/create_vdbs.py b/data/create_vdbs.py
new file mode 100644
index 0000000..8b4ddbd
--- /dev/null
+++ b/data/create_vdbs.py
@@ -0,0 +1,255 @@
+import os
+import json
+from pprint import pprint
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import faiss
+import numpy as np
+from tqdm import tqdm
+from openai import OpenAI
+
+
+with open("./config.json", encoding="utf-8") as f:
+    cfg = json.load(f)
+
+client = OpenAI(api_key=cfg.get("OPENAI_API_KEY"))
+
+
+def embed_texts(texts):
+    response = client.embeddings.create(
+        model=cfg.get("EMBEDDING_MODEL"),
+        input=texts
+    )
+    return [item.embedding for item in response.data]
+
+
+def build_faiss_index(vectors, output_path):
+    dim = len(vectors[0])
+    index = faiss.IndexFlatL2(dim)
+
+    vectors_np = np.array(vectors).astype("float32")
+    index.add(vectors_np)
+
+    faiss.write_index(index, output_path)
+
+
+def build_basic_vector_db(basic_meta, output_dir):
+    descriptions = []
+    mapping = []  # list of keys
+
+    for key, item in tqdm(basic_meta.items()):
+        desc = item.get("description")
+        if not desc:
+            continue
+
+        descriptions.append(desc)
+        mapping.append(key)
+
+    if not descriptions:
+        return
+
+    vectors = embed_texts(descriptions)
+
+    os.makedirs(output_dir, exist_ok=True)
+    build_faiss_index(vectors, os.path.join(output_dir, "basic.index"))
+
+    with open(os.path.join(output_dir, "basic.mapping.json"), "w", encoding="utf-8") as f:
+        json.dump(mapping, f, ensure_ascii=False, indent=2)
+
+
+def build_library_highlevel_db(libs_meta, output_dir):
+    descriptions = []
+    mapping = []
+
+    for lib_name, lib in tqdm(libs_meta.items()):
+        desc = lib.get("description")
+        if not desc:
+            continue
+
+        descriptions.append(desc)
+        mapping.append(lib_name)
+
+    if not descriptions:
+        return
+
+    vectors = embed_texts(descriptions)
+
+    os.makedirs(output_dir, exist_ok=True)
+    build_faiss_index(vectors, os.path.join(output_dir, "libraries.index"))
+
+    with open(os.path.join(output_dir, "libraries.mapping.json"), "w", encoding="utf-8") as f:
+        json.dump(mapping, f, ensure_ascii=False, indent=2)
+
+
+def build_library_subpage_dbs(libs_meta, output_dir):
+    os.makedirs(output_dir, exist_ok=True)
+
+    for lib_name, lib in tqdm(libs_meta.items()):
+        subpages = lib.get("subpages", {})
+        if not subpages:
+            continue  # skip libraries without subpages
+
+        descriptions = []
+        mapping = []  # (library_name, subpage_name)
+
+        for page_name, page in subpages.items():
+            desc = page.get("description")
+            if desc:
+                descriptions.append(desc)
+                mapping.append((lib_name, page_name))
+
+        if not descriptions:
+            continue
+
+        vectors = embed_texts(descriptions)
+
+        index_path = os.path.join(output_dir, f"{lib_name}.subpages.index")
+        build_faiss_index(vectors, index_path)
+
+        with open(os.path.join(output_dir, f"{lib_name}.subpages.mapping.json"), "w", encoding="utf-8") as f:
+            json.dump(mapping, f, ensure_ascii=False, indent=2)
+
+
+def retrieve_basic(query, basic_meta, output_dir, k=5):
+    index = faiss.read_index(os.path.join(output_dir, "basic.index"))
+
+    with open(os.path.join(output_dir, "basic.mapping.json"), encoding="utf-8") as f:
+        mapping = json.load(f)
+
+    vector = embed_texts([query])[0]
+    vector_np = np.array([vector]).astype("float32")
+
+    distances, indices = index.search(vector_np, k)
+
+    results = []
+    for idx in indices[0]:
+        key = mapping[idx]
+        results.append((key, basic_meta[key]))
+
+    return results
+
+
+def retrieve_library_highlevel(query, libs_meta, output_dir, k=5):
+    index = faiss.read_index(os.path.join(output_dir, "libraries.index"))
+
+    with open(os.path.join(output_dir, "libraries.mapping.json"), encoding="utf-8") as f:
+        mapping = json.load(f)
+
+    vector = embed_texts([query])[0]
+    vector_np = np.array([vector]).astype("float32")
+
+    distances, indices = index.search(vector_np, k)
+
+    results = []
+    for idx in indices[0]:
+        lib_name = mapping[idx]
+        results.append(libs_meta[lib_name])
+
+    return results
+
+
+def retrieve_library_subpages(query, lib_name, libs_meta, output_dir, k=5):
+    index = faiss.read_index(os.path.join(output_dir, f"{lib_name}.subpages.index"))
+
+    with open(os.path.join(output_dir, f"{lib_name}.subpages.mapping.json"), encoding="utf-8") as f:
+        mapping = json.load(f)
+
+    vector = embed_texts([query])[0]
+    vector_np = np.array([vector]).astype("float32")
+
+    distances, indices = index.search(vector_np, k)
+
+    results = []
+    for idx in indices[0]:
+        lib, page = mapping[idx]
+        results.append((page, libs_meta[lib]["subpages"][page]))
+
+    return results
+
+
+def build_all_vector_dbs(basic_meta: dict, libs_meta: dict):
+    vdbs_output_dir = cfg.get("VDBS_OUTPUT_DIR")
+
+    print("Building basic vector DB...")
+    build_basic_vector_db(basic_meta, vdbs_output_dir)
+    print("Building library high-level vector DB...")
+    build_library_highlevel_db(libs_meta, vdbs_output_dir)
+    print("Building per-library subpage vector DBs...")
+    build_library_subpage_dbs(libs_meta, vdbs_output_dir)
+
+    print("Build complete.")
+
+
+if __name__ == "__main__":
+    import logging
+
+    LOG_PATH = "vector_db_test.log"
+
+    logging.basicConfig(
+        filename=LOG_PATH,
+        filemode="w",
+        level=logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s"
+    )
+
+    with open(cfg.get("BASIC_OUTPUT_PATH"), encoding="utf-8") as f:
+        basic_meta = json.load(f)
+
+    with open(cfg.get("LIBRARIES_OUTPUT_PATH"), encoding="utf-8") as f:
+        libs_meta = json.load(f)
+
+    build_all_vector_dbs(basic_meta, libs_meta)
+
+    # ---------------------------------------------------------
+    # TESTS
+    # ---------------------------------------------------------
+    logging.info("=== Starting Vector DB Retrieval Tests ===")
+
+    # -----------------------------
+    # Test 1: Basic Docs Retrieval
+    # -----------------------------
+    query1 = "function"
+    results1 = retrieve_basic(query1, basic_meta, cfg.get("VDBS_OUTPUT_DIR"), k=3)
+
+    logging.info("=== Test 1: Basic Docs Retrieval ===")
+    logging.info(f"Query: {query1}")
+
+    for i, (key, item) in enumerate(results1, 1):
+        logging.info(f"\nResult {i}:")
+        logging.info(f"Key: {key}")
+        logging.info(f"Description: {item.get('description')}")
+
+    # -----------------------------
+    # Test 2: Library High-Level Retrieval
+    # -----------------------------
+    query2 = "How to handle Google authentication in Alusus?"
+    results2 = retrieve_library_highlevel(query2, libs_meta, cfg.get("VDBS_OUTPUT_DIR"), k=3)
+
+    logging.info("\n=== Test 2: Library High-Level Retrieval ===")
+    logging.info(f"Query: {query2}")
+
+    for i, item in enumerate(results2, 1):
+        logging.info(f"\nResult {i}:")
+        logging.info(f"Library: {item.get('name')}")
+        logging.info(f"Description: {item.get('description')}")
+
+    # -----------------------------
+    # Test 3: Subpage Retrieval for a Specific Library
+    # -----------------------------
+    TEST_LIBRARY = "WebPlatform"
+
+    if TEST_LIBRARY in libs_meta and libs_meta[TEST_LIBRARY].get("subpages"):
+        query3 = "How to create a webpage with a picture in Alusus?"
+        results3 = retrieve_library_subpages(query3, TEST_LIBRARY, libs_meta, cfg.get("VDBS_OUTPUT_DIR"), k=3)
+
+        logging.info(f"\n=== Test 3: Subpage Retrieval for '{TEST_LIBRARY}' ===")
+        logging.info(f"Query: {query3}")
+
+        for i, (page, item) in enumerate(results3, 1):
+            logging.info(f"\nResult {i}:")
+            logging.info(f"Subpage: {page}")
+            logging.info(f"Description: {item.get('description')}")
+    else:
+        logging.info(f"\nSkipping Test 3: Library '{TEST_LIBRARY}' has no subpages.")
+
+    logging.info("=== Vector DB Retrieval Tests Completed ===")

From cefaeeb8e32ae9657434cd11512a61e9d9863de0 Mon Sep 17 00:00:00 2001
From: KaisHasan
Date: Wed, 14 Jan 2026 17:03:18 +0300
Subject: [PATCH 7/7] add readme.md file with instructions on how to run the code

---
 data/readme.md | 164 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 data/readme.md

diff --git a/data/readme.md b/data/readme.md
new file mode 100644
index 0000000..d339f79
--- /dev/null
+++ b/data/readme.md
@@ -0,0 +1,164 @@
+# 📘 Documentation Scraper & Vector Database Builder
+
+This folder contains the full pipeline for:
+
+- Scraping documentation from the Alusus website and GitHub repos
+- Extracting metadata for basic docs and libraries
+- Generating descriptions using OpenAI models
+- Extracting notable functions and classes for each library using OpenAI models
+- Creating FAISS vector databases for retrieval
+
+The pipeline consists of three main scripts:
+
+1. `scraper.py`
+2. `generate_description.py`
+3. `create_vdbs.py`
+
+Each script depends on a shared `config.json` file.
+
+---
+
+## 📦 Requirements
+
+Before running the pipeline, ensure that:
+
+- You have installed the full dependencies of the main repository
+- Additionally, **BeautifulSoup (bs4)** must be installed:
+
+```bash
+pip install beautifulsoup4
+```
+
+or
+
+```bash
+conda install conda-forge::beautifulsoup4
+```
+
+This is required for HTML parsing during documentation scraping.
+
+---
+
+## ⚙️ Configuration (`config.json`)
+
+You must create a `config.json` file in this folder before running any script.
+
+Use the following structure:
+
+```json
+{
+  "WEBSITE_DOCS_URLS": [
+    "https://alusus.org/Documents/lang-reference.en.html",
+    "https://alusus.org/Documents/srt-reference.en.html"
+  ],
+  "BASIC_OUTPUT_PATH": "./basic_meta.json",
+  "LIBRARIES_META_URL": "https://alusus.org/Releases/libraries.json",
+  "LIBRARIES_OUTPUT_PATH": "./libs_meta.json",
+  "GITHUB_TOKEN": "...",
+  "OPENAI_API_KEY": "...",
+  "OPENAI_SUMMARISATION_MODEL": "gpt-4o-mini",
+  "SUMMARY_PROMPT_PATH": "./summary_gen_prompt.txt",
+  "OPENAI_FUNC_CLS_EXTRACTION_MODEL": "gpt-5.2-chat-latest",
+  "FUNC_CLS_EXCTRACTION_PROMPT_PATH": "./extract_func_cls_prompt.txt",
+  "EMBEDDING_MODEL": "text-embedding-3-large",
+  "VDBS_OUTPUT_DIR": "./vdbs/"
+}
+```
+
+### 🔑 Required Keys
+
+- **GITHUB_TOKEN**
+  Used to query GitHub’s API for default branch names without hitting rate limits.
+
+- **OPENAI_API_KEY**
+  Required for generating summaries, extracting function/class metadata, and creating embeddings.
+
+---
+
+## 🔐 Creating a GitHub Token
+
+GitHub requires authentication for higher API rate limits.
+To create a token:
+
+1. Visit:
+   `https://github.com/settings/tokens?type=beta`
+2. Click **“Generate new token”**
+3. Choose **Fine-grained token**
+4. Give it a name (e.g., `alusus-docs-scraper`)
+5. Under **Repository access**, choose:
+   - **Public repositories** → Read-only
+6. Generate the token
+7. Copy it and paste it into your `config.json` under `"GITHUB_TOKEN"`
+
+This token allows the scraper to query repository metadata without hitting the unauthenticated rate limit.
+
+---
+
+## ▶️ Running the Pipeline
+
+Before running the scripts, activate the full environment of the main repository.
+
+Then navigate to this folder and run the scripts in order:
+
+### 1. Scrape documentation and library metadata
+
+```bash
+python scraper.py
+```
+
+This generates:
+
+- `basic_meta.json`
+- `libs_meta.json`
+
+---
+
+### 2. Generate descriptions using OpenAI
+
+```bash
+python generate_description.py
+```
+
+This enriches the metadata with:
+
+- Summaries
+- Function/class descriptions
+- Cleaned content
+
+---
+
+### 3. Create vector databases (FAISS)
+
+```bash
+python create_vdbs.py
+```
+
+This produces:
+
+- A vector DB for basic docs
+- A vector DB for library‑level descriptions
+- A vector DB for each library with subpages
+
+All stored under the directory defined in:
+
+```json
+"VDBS_OUTPUT_DIR": "./vdbs/"
+```
+
+---
+
+## 📁 Output Structure
+
+After running all scripts, you will have:
+
+```
+basic_meta.json
+libs_meta.json
+vdbs/
+  basic.index
+  basic.mapping.json
+  libraries.index
+  libraries.mapping.json
+  <library_name>.subpages.index
+  <library_name>.subpages.mapping.json
+```
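+
+---
+
+## 🔎 Quick Retrieval Check (optional)
+
+After the pipeline finishes, you can reuse the retrieval helpers from `create_vdbs.py` to sanity-check the
+generated indexes. This is a minimal sketch; it assumes you run it from this folder with the same
+`config.json` and that the vector DBs above were already built (the sample question is only an example):
+
+```python
+import json
+
+# Reuses the helpers and the already-loaded config from create_vdbs.py in this folder.
+from create_vdbs import cfg, retrieve_basic
+
+# Load the basic-docs metadata produced by scraper.py / generate_description.py.
+with open(cfg.get("BASIC_OUTPUT_PATH"), encoding="utf-8") as f:
+    basic_meta = json.load(f)
+
+# Fetch the three closest basic-doc sections for a sample question and print their summaries.
+for key, item in retrieve_basic("How to define a function?", basic_meta, cfg.get("VDBS_OUTPUT_DIR"), k=3):
+    print(key, "->", item.get("description"))
+```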