From c6b70103924f8cad2aa267314aa50b562b00376b Mon Sep 17 00:00:00 2001
From: KaisHasan
Date: Wed, 8 Oct 2025 13:59:07 +0300
Subject: [PATCH 1/7] add python cache folders to the git ignore list

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 53efcb5..da683f9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .apm
 Build/
+__pycache__

From ccae913357e439f6b1d91b3e89ab39328710a6f7 Mon Sep 17 00:00:00 2001
From: KaisHasan
Date: Wed, 8 Oct 2025 13:59:39 +0300
Subject: [PATCH 2/7] write the mcp server code in flask

---
 mcp/__init__.py |   0
 mcp/app.py      | 114 ++++++++++++++++++++++++++++++++++++++++++++++++
 mcp/tools.json  |  17 +++++++
 3 files changed, 131 insertions(+)
 create mode 100644 mcp/__init__.py
 create mode 100644 mcp/app.py
 create mode 100644 mcp/tools.json

diff --git a/mcp/__init__.py b/mcp/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mcp/app.py b/mcp/app.py
new file mode 100644
index 0000000..6644e92
--- /dev/null
+++ b/mcp/app.py
@@ -0,0 +1,114 @@
+import json
+
+from flask import Flask, request, jsonify
+
+from copilot_server.agents import BasicExamplesRetriever
+
+app = Flask(__name__)
+
+global_id = 0
+BASIC_VDB_PATH = 'vdb_basic.index'
+BASIC_EXAMPLES_INDEX_PATH = 'index.json'
+
+basic_examples_retriever = BasicExamplesRetriever(BASIC_VDB_PATH, BASIC_EXAMPLES_INDEX_PATH, debug=True)
+
+
+with open("mcp/tools.json", "r", encoding="utf-8") as tools_file:
+    TOOLS = json.load(tools_file)
+
+
+def initialize_mcp(id):
+    return jsonify({
+        "jsonrpc": "2.0",
+        "id": id,
+        "result": {
+            "protocolVersion": "2025-06-18",
+            "serverInfo": {"name": "alusus-docs", "version": "1.0.0"},
+            "capabilities": {"tools": {"listChanged": True}, "resources": {}}
+        }
+    })
+
+
+def retrieve_docs(query):
+    docs = basic_examples_retriever.get_docs(query, k=10)
+
+    return docs
+
+
+def get_tool(tool_name):
+    if tool_name == 'alusus_docs_retrieve':
+        return retrieve_docs
+    else:
+        return None
+
+
+@app.route('/mcp', methods=['POST'])
+def mcp():
+    payload = request.get_json(force=True)
+    print(f'payload: {payload}')
+
+    jsonrpc = payload.get("jsonrpc")
+    method = payload.get("method")
+    id_ = payload.get("id")
+
+    if method == 'initialize':
+        return initialize_mcp(id_)
+
+    if method == 'notifications/initialized':
+        return jsonify({
+            "jsonrpc": "2.0",
+            "method": "notifications/initialized",
+            "params": {}
+        }), 200
+
+    if method == 'tools/list':
+        return jsonify({
+            "jsonrpc": "2.0",
+            "id": id_,
+            "result": {"tools": TOOLS}
+        })
+
+    if method == 'ping':
+        return jsonify({
+            "jsonrpc": "2.0",
+            "id": id_,
+            "result": {}
+        })
+
+    if method == 'tools/call':
+        params = payload.get("params")
+        if params is not None:
+            tool_name = params.get("name")
+            args = params.get("arguments")
+
+            tool = get_tool(tool_name)
+
+            if tool is None:
+                return ("Bad Request, invalid tool name", 400)
+
+            results = tool(**args)
+
+            content = [{"type": "text", "text": item} for item in results]
+
+            return jsonify({
+                "jsonrpc": "2.0",
+                "id": id_,
+                "result": {
+                    "content": content
+                }
+            })
+        else:
+            return ("Bad Request, include required parameters to invoke the tool", 400)
+
+
+@app.route('/query', methods=['POST'])
+def handle_query():
+    data = request.json
+
+    print(f'data:\n {data}')
+
+    return jsonify({"response": "message reached the server!!"})
+
+
+if __name__ == '__main__':
+    app.run(debug=True)
diff --git a/mcp/tools.json b/mcp/tools.json
new file mode 100644
index 0000000..53a6859
--- /dev/null
+++ b/mcp/tools.json
@@ -0,0 +1,17 @@
+[
+  {
+    "name": "alusus_docs_retrieve",
"alusus_docs_retrieve", + "title": "Alusus Docs", + "description": "Retrieve documentation for a given query about Alusus programming language", + "inputSchema": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The user question about Alusus programming language" + } + }, + "required": ["query"] + } + } +] From 13dbd8d8f67f581e19fd4365636294746a6b6bb9 Mon Sep 17 00:00:00 2001 From: KaisHasan Date: Wed, 8 Oct 2025 14:00:38 +0300 Subject: [PATCH 3/7] configure opencode to use the mcp test server --- opencode/config.json | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 opencode/config.json diff --git a/opencode/config.json b/opencode/config.json new file mode 100644 index 0000000..53172f5 --- /dev/null +++ b/opencode/config.json @@ -0,0 +1,39 @@ +{ + "$schema": "https://opencode.ai/config.json", + "agent": { + "alusus-server": { + "mode": "subagent", + "description": "Posts user input to my HTTP endpoint and returns the JSON response", + "model": "openai/o4-mini", + "prompt": "Take the user’s question, POST `{ \"input\": \"\" }` to http://127.0.0.1:5000/query , then return the parsed JSON." + }, + + "alusus-simple": { + "mode": "subagent", + "description": "Clean the user query and retrieve docs from the correct mcp", + "prompt": "When the user asks about the Alusus programming language, Clean the user’s question by removing filler words, trailing punctuation, and convert it to a concise form. Then take it and, POST `{ \"query\": \"\" }` to http://127.0.0.1:5000/retrieve-docs. Once the tool returns, compose a final answer grounded in the returned 'docs' field.", + "model": "openai/o4-mini" + }, + + "alusus-docs": { + "mode": "subagent", + "description": "Posts user input to my mcp endpoint and returns the JSON response", + "model": "openai/o4-mini", + "prompt": "Use the mcp \"alusus-docs\" to retrieve docs through the tool `alusus_docs_retrieve`, the mcp use http, so do the initialization and all steps required for that." 
+    }
+  },
+
+
+  "mcp": {
+    "alusus-docs": {
+      "type": "remote",
+      "url": "http://127.0.0.1:5000/mcp",
+      "enabled": true
+    }
+  },
+
+  "tools": {
+    // Enable all tools from the docs server
+    "alusus-docs_*": true
+  }
+}

From 13946cbbe499957b8e3dd6f94332e079e93f5c7e Mon Sep 17 00:00:00 2001
From: KaisHasan
Date: Wed, 14 Jan 2026 14:22:07 +0300
Subject: [PATCH 4/7] implement a scraper for the website and github repos of Alusus

---
 data/scraper.py | 248 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 248 insertions(+)
 create mode 100644 data/scraper.py

diff --git a/data/scraper.py b/data/scraper.py
new file mode 100644
index 0000000..87da1cf
--- /dev/null
+++ b/data/scraper.py
@@ -0,0 +1,248 @@
+import re
+import json
+import requests
+from tqdm import tqdm
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+from bs4.element import Tag
+from bs4 import BeautifulSoup
+
+
+with open("./config.json", encoding="utf-8") as f:
+    cfg = json.load(f)
+
+
+def download_json(url: str) -> dict:
+    if not url:
+        return None
+
+    response = requests.get(url)
+    response.raise_for_status()
+    return response.json()
+
+
+def fetch_text(url: str) -> str:
+    try:
+        r = requests.get(url)
+        if r.status_code == 200:
+            return r.text
+    except Exception:
+        pass
+    return None
+
+
+def get_default_branch(repo_url: str) -> str:
+    # Extract owner and repo from URL
+    match = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
+    if not match:
+        raise ValueError("Invalid GitHub repository URL")
+
+    owner, repo = match.group(1), match.group(2)
+
+    api_url = f"https://api.github.com/repos/{owner}/{repo}"
+
+    headers = {"Accept": "application/vnd.github+json"}
+    token = cfg.get("GITHUB_TOKEN")
+    if token:
+        headers["Authorization"] = f"Bearer {token}"
+
+    response = requests.get(api_url, headers=headers)
+
+    if response.status_code != 200:
+        raise RuntimeError(f"GitHub API error: {response.status_code} - {response.text}")
+
+    data = response.json()
+    return data.get("default_branch", "main")  # Fallback to `main`
+
+
+def path_to_raw_url(path: str, repo_url: str) -> str:
+    default_branch = get_default_branch(repo_url)
+    view_url = repo_url.rstrip("/") + f"/blob/{default_branch}/" + path.lstrip("/")
+    raw_url = view_url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")
+
+    return raw_url
+
+
+def get_readme_url(repo_url: str) -> str:
+    return path_to_raw_url("/readme.md", repo_url)
+
+
+def extract_subpage_links(md_text: str) -> list[tuple[str, str]]:
+    pattern = r"\[\[([^\]]+)\]\]\(([^)]+)\)"
+    matches = re.findall(pattern, md_text)
+    return matches
+
+
+def process_library(item: dict) -> dict:
+    lib_name = item.get("name")
+    repo_url = item.get("url")
+
+    lib_meta = {
+        "name": lib_name,
+        "repo_url": repo_url,
+        "description": "",
+        "content": None,
+        "subpages": {}
+    }
+
+    if not repo_url:
+        return lib_meta
+
+    # Find README
+    readme_url = get_readme_url(repo_url)
+    if not readme_url:
+        return lib_meta
+
+    readme_text = fetch_text(readme_url)
+    lib_meta["content"] = readme_text
+
+    if not readme_text:
+        return lib_meta
+
+    # Extract subpages
+    subpages = extract_subpage_links(readme_text)
+
+    for link_text, sub_url in subpages:
+        if sub_url == "readme.ar.md":
+            continue
+
+        # Convert relative links to absolute
+        if not sub_url.startswith("http"):
+            sub_url = path_to_raw_url(sub_url, repo_url)
+
+        content = fetch_text(sub_url)
+        if content:
+            lib_meta["subpages"][link_text] = {
+                "url": sub_url,
+                "description": "",
+                "content": content,
+            }
+
+    return lib_meta
+
+
+def is_inside(el, container):
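+    """Return True if `el` is `container` itself or is nested anywhere inside it."""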
+    parent = el
+    while parent:
+        if parent is container:
+            return True
+        parent = parent.parent
+    return False
+
+
+def process_webpage_doc(url: str) -> dict:
+    html = fetch_text(url)
+    soup = BeautifulSoup(html, "html.parser")
+
+    sidebar = soup.select_one(".col-md-3 .card.side-sticky .card-content")
+    if not sidebar:
+        raise ValueError("Sidebar not found in the HTML page")
+
+    # Extract ordered list of (id, title) from sidebar
+    nav_items = []
+    for a in sidebar.select("a[href^='#']"):
+        href = a.get("href")
+        if href and href.startswith("#"):
+            section_id = href[1:]  # remove '#'
+            title = a.get_text(strip=True)
+            nav_items.append((section_id, title))
+
+    if not nav_items:
+        return {}
+
+    content_container = soup.select_one(".col-md-9") or soup.body
+
+    sections = {}
+    for i, (section_id, title) in enumerate(nav_items):
+        start_el = soup.find(id=section_id)
+        if not start_el:
+            continue
+
+        # Determine the next section ID (if any)
+        next_id = nav_items[i + 1][0] if i + 1 < len(nav_items) else None
+
+        content_parts = []
+        started = False
+
+        # Walk the DOM in document order, including text nodes
+        for node in start_el.next_elements:
+            # Skip the starting element itself
+            if not started:
+                started = True
+                continue
+
+            # If we left the content container, stop
+            if isinstance(node, Tag) and not is_inside(node, content_container):
+                break
+
+            # Stop when we reach the next section anchor
+            if isinstance(node, Tag) and next_id and node.get("id") == next_id:
+                break
+
+            # if this node contains the next node, skip
+            if isinstance(node, Tag) and next_id:
+                inner_next = node.find(id=next_id)
+                if inner_next is not None:
+                    continue
+
+            content_parts.append(str(node))
+
+        sections[title] = {
+            "content": "".join(content_parts),
+            "description": "",  # to be filled later using AI models
+        }
+
+    return sections
+
+
+def parallel_process_libraries(max_workers=5) -> dict:
+    print("\nDownloading libraries info...")
+    libraries = download_json(cfg.get("LIBRARIES_META_URL"))
+    print("\nDone Downloading libraries info...")
+
+    print("\nExtracting libraries meta information...")
+    def process(lib_info):
+        lib_name = lib_info.get("name")
+        data = process_library(lib_info)
+        return lib_name, data
+
+    libs_meta = {}
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [executor.submit(process, lib_info) for lib_info in libraries]
+
+        for future in tqdm(as_completed(futures), total=len(futures)):
+            lib_name, data = future.result()
+            libs_meta[lib_name] = data  # safe: done in main thread only
+
+    return libs_meta
+
+
+def process_basic() -> dict:
+    basic_meta = {}
+
+    print("\nExtracting website docs...")
+    for doc_url in tqdm(cfg.get("WEBSITE_DOCS_URLS")):
+        doc_sections = process_webpage_doc(doc_url)
+        basic_meta.update(doc_sections)
+
+    return basic_meta
+
+
+def save_json(d: dict, output_path: str):
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(d, f, indent=2, ensure_ascii=False)
+
+    print(f'Saved output to {output_path}')
+
+
+def main():
+    libs_meta = parallel_process_libraries()
+    save_json(libs_meta, cfg.get("LIBRARIES_OUTPUT_PATH"))
+
+    basic_meta = process_basic()
+    save_json(basic_meta, cfg.get("BASIC_OUTPUT_PATH"))
+
+
+if __name__ == "__main__":
+    main()

From c8cb2a3c3c4b1b17a446028cb145a26eb2df6dae Mon Sep 17 00:00:00 2001
From: KaisHasan
Date: Wed, 14 Jan 2026 14:27:21 +0300
Subject: [PATCH 5/7] add description to scraped data via AI models

---
 data/extract_func_cls_prompt.txt |  33 +++++++
 data/generate_description.py     | 147 +++++++++++++++++++++++++++++++++++++++
 data/summary_gen_prompt.txt      |  11 +++
 3 files changed, 191 insertions(+)
 create mode 100644 data/extract_func_cls_prompt.txt
 create mode 100644 data/generate_description.py
 create mode 100644 data/summary_gen_prompt.txt

diff --git a/data/extract_func_cls_prompt.txt b/data/extract_func_cls_prompt.txt
new file mode 100644
index 0000000..821e00c
--- /dev/null
+++ b/data/extract_func_cls_prompt.txt
@@ -0,0 +1,33 @@
+You are analyzing documentation for a software library. Your task is to extract all notable
+functions and classes described in the content.
+
+A "notable" function or class is one that:
+- is explicitly documented, OR
+- appears in code blocks, OR
+- is described as part of the library’s API.
+
+For each item, extract:
+- "type": either "function" or "class"
+- "description": a single concise sentence describing what it does
+- "declaration": the function signature or class declaration (with constructors) exactly as shown in the docs
+
+Rules:
+- Output ONLY valid JSON.
+- Do not add markdown such as ```json```.
+- Do not invent APIs that are not mentioned.
+- If the declaration is incomplete in the docs, extract the best available form.
+- Ignore examples that do not define new functions/classes.
+- For class declarations, the declarations of the various constructors should be presented, not just the class name.
+
+Content:
+{{CONTENT}}
+
+Return JSON in the following format:
+
+[
+  {
+    "type": "function",
+    "description": "...",
+    "declaration": "..."
+  }
+]
diff --git a/data/generate_description.py b/data/generate_description.py
new file mode 100644
index 0000000..0346098
--- /dev/null
+++ b/data/generate_description.py
@@ -0,0 +1,147 @@
+import json
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pprint import pprint
+
+from openai import OpenAI
+from tqdm import tqdm
+
+
+with open("./config.json", encoding="utf-8") as f:
+    cfg = json.load(f)
+
+with open(cfg.get("SUMMARY_PROMPT_PATH"), encoding="utf-8") as f:
+    summary_prompt = f.read()
+
+with open(cfg.get("FUNC_CLS_EXCTRACTION_PROMPT_PATH"), encoding="utf-8") as f:
+    func_cls_extract_prompt = f.read()
+
+client = OpenAI(api_key=cfg.get("OPENAI_API_KEY"))
+
+
+def parallel_summarize(meta_dict, max_workers=5):
+    items = list(meta_dict.values())
+
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = {executor.submit(summarize_hierarchy, item): item for item in items}
+
+        for _ in tqdm(as_completed(futures), total=len(futures)):
+            pass  # progress bar only
+
+    return meta_dict
+
+
+def summarize_text(text: str) -> str:
+    prompt = summary_prompt.replace("{{CONTENT}}", text)
+
+    response = client.chat.completions.create(
+        model=cfg.get("OPENAI_SUMMARISATION_MODEL"),
+        messages=[{"role": "user", "content": prompt}]
+    )
+
+    return response.choices[0].message.content.strip()
+
+
+def summarize_hierarchy(node):
+    """
+    node can be:
+    - a leaf: library without subpages or a subpage
+    - a library with subpages
+    """
+
+    # Leaf node
+    # either subpages key does not exist or its value is an empty dict
+    if not node.get("subpages"):
+        node["description"] = summarize_text(node["content"])
+        return node["description"]
+
+    # Non-leaf: summarize subpages first
+    subpages_summaries = []
+    for subpage_title, subpage_meta in node.get("subpages").items():
+        summary = summarize_hierarchy(subpage_meta)
+        subpages_summaries.append(f"{subpage_title}: {summary}")
+
+    combined = "\n".join(subpages_summaries)
+    node["description"] = summarize_text(combined)
+
+    return node["description"]
node["description"] + + +def save_json(d: dict, output_path: str): + with open(output_path, "w", encoding="utf-8") as f: + json.dump(d, f, indent=2, ensure_ascii=False) + + print(f'Saved output to {output_path}') + + +def build_library_content(library_meta): + parts = [] + + # README content + if "content" in library_meta: + parts.append(library_meta["content"]) + + # Subpages + if "subpages" in library_meta: + for sub in library_meta["subpages"].values(): + if "content" in sub: + parts.append(sub["content"]) + + return "\n\n".join(parts) + + +def extract_api_items(library_meta: dict): + full_content = build_library_content(library_meta) + + prompt = func_cls_extract_prompt.replace("{{CONTENT}}", full_content) + + response = client.chat.completions.create( + model=cfg.get("OPENAI_FUNC_CLS_EXTRACTION_MODEL"), + messages=[{"role": "user", "content": prompt}] + ) + + text = response.choices[0].message.content.strip() + + # Parse JSON safely + try: + data = json.loads(text) + library_meta["notable_functions_and_classes"] = data + except Exception: + print("Model returned invalid JSON, raw output:") + print(text) + library_meta["notable_functions_and_classes"] = [] + + +def parallel_func_cls_extraction(meta_dict, max_workers=5): + items = list(meta_dict.values()) + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(extract_api_items, item): item for item in items} + + for _ in tqdm(as_completed(futures), total=len(futures)): + pass # progress bar only + + return meta_dict + + +def main(): + print('\nReading basic meta file...') + with open(cfg.get("BASIC_OUTPUT_PATH"), encoding="utf-8") as f: + basic_meta = json.load(f) + + print('\nGenerating summaries...') + parallel_summarize(basic_meta) + save_json(basic_meta, cfg.get("BASIC_OUTPUT_PATH")) + + print('\nReading libraries meta file...') + with open(cfg.get("LIBRARIES_OUTPUT_PATH"), encoding="utf-8") as f: + libs_meta = json.load(f) + + print('\nGenerating summaries...') + parallel_summarize(libs_meta) + print('\nExtract notable functions and classes...') + parallel_func_cls_extraction(libs_meta) + save_json(libs_meta, cfg.get("LIBRARIES_OUTPUT_PATH")) + + +if __name__ == "__main__": + main() diff --git a/data/summary_gen_prompt.txt b/data/summary_gen_prompt.txt new file mode 100644 index 0000000..b7267b1 --- /dev/null +++ b/data/summary_gen_prompt.txt @@ -0,0 +1,11 @@ +You are summarizing documentation content to create a compact, embedding-friendly description. + +Your goal: +- Extract the core concepts. +- Produce a concise, information-dense description suitable for embeddings and AI agents. +- Do NOT rewrite the content; abstract it in a 1-3 sentences, max 50 words. +- Focus on what the component *is* and *does*. 
+- If the content is empty, then return the word 'EMPTY' only
+
+Content to summarize:
+{{CONTENT}}

From 014bf3d386cd5784edc8bfc6c77460e40664bba7 Mon Sep 17 00:00:00 2001
From: KaisHasan
Date: Wed, 14 Jan 2026 14:28:02 +0300
Subject: [PATCH 6/7] create vector databases from scraped information

---
 data/create_vdbs.py | 255 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 255 insertions(+)
 create mode 100644 data/create_vdbs.py

diff --git a/data/create_vdbs.py b/data/create_vdbs.py
new file mode 100644
index 0000000..8b4ddbd
--- /dev/null
+++ b/data/create_vdbs.py
@@ -0,0 +1,255 @@
+import os
+import json
+from pprint import pprint
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+import faiss
+import numpy as np
+from tqdm import tqdm
+from openai import OpenAI
+
+
+with open("./config.json", encoding="utf-8") as f:
+    cfg = json.load(f)
+
+client = OpenAI(api_key=cfg.get("OPENAI_API_KEY"))
+
+
+def embed_texts(texts):
+    response = client.embeddings.create(
+        model=cfg.get("EMBEDDING_MODEL"),
+        input=texts
+    )
+    return [item.embedding for item in response.data]
+
+
+def build_faiss_index(vectors, output_path):
+    dim = len(vectors[0])
+    index = faiss.IndexFlatL2(dim)
+
+    vectors_np = np.array(vectors).astype("float32")
+    index.add(vectors_np)
+
+    faiss.write_index(index, output_path)
+
+
+def build_basic_vector_db(basic_meta, output_dir):
+    descriptions = []
+    mapping = []  # list of keys
+
+    for key, item in tqdm(basic_meta.items()):
+        desc = item.get("description")
+        if not desc:
+            continue
+
+        descriptions.append(desc)
+        mapping.append(key)
+
+    if not descriptions:
+        return
+
+    vectors = embed_texts(descriptions)
+
+    os.makedirs(output_dir, exist_ok=True)
+    build_faiss_index(vectors, os.path.join(output_dir, "basic.index"))
+
+    with open(os.path.join(output_dir, "basic.mapping.json"), "w", encoding="utf-8") as f:
+        json.dump(mapping, f, ensure_ascii=False, indent=2)
+
+
+def build_library_highlevel_db(libs_meta, output_dir):
+    descriptions = []
+    mapping = []
+
+    for lib_name, lib in tqdm(libs_meta.items()):
+        desc = lib.get("description")
+        if not desc:
+            continue
+
+        descriptions.append(desc)
+        mapping.append(lib_name)
+
+    if not descriptions:
+        return
+
+    vectors = embed_texts(descriptions)
+
+    os.makedirs(output_dir, exist_ok=True)
+    build_faiss_index(vectors, os.path.join(output_dir, "libraries.index"))
+
+    with open(os.path.join(output_dir, "libraries.mapping.json"), "w", encoding="utf-8") as f:
+        json.dump(mapping, f, ensure_ascii=False, indent=2)
+
+
+def build_library_subpage_dbs(libs_meta, output_dir):
+    os.makedirs(output_dir, exist_ok=True)
+
+    for lib_name, lib in tqdm(libs_meta.items()):
+        subpages = lib.get("subpages", {})
+        if not subpages:
+            continue  # skip libraries without subpages
+
+        descriptions = []
+        mapping = []  # (library_name, subpage_name)
+
+        for page_name, page in subpages.items():
+            desc = page.get("description")
+            if desc:
+                descriptions.append(desc)
+                mapping.append((lib_name, page_name))
+
+        if not descriptions:
+            continue
+
+        vectors = embed_texts(descriptions)
+
+        index_path = os.path.join(output_dir, f"{lib_name}.subpages.index")
+        build_faiss_index(vectors, index_path)
+
+        with open(os.path.join(output_dir, f"{lib_name}.subpages.mapping.json"), "w", encoding="utf-8") as f:
+            json.dump(mapping, f, ensure_ascii=False, indent=2)
+
+
+def retrieve_basic(query, basic_meta, output_dir, k=5):
+    index = faiss.read_index(os.path.join(output_dir, "basic.index"))
+
+    with open(os.path.join(output_dir, "basic.mapping.json"), encoding="utf-8") as f:
+        mapping = json.load(f)
+
+    vector = embed_texts([query])[0]
+    vector_np = np.array([vector]).astype("float32")
+
+    distances, indices = index.search(vector_np, k)
+
+    results = []
+    for idx in indices[0]:
+        key = mapping[idx]
+        results.append((key, basic_meta[key]))
+
+    return results
+
+
+def retrieve_library_highlevel(query, libs_meta, output_dir, k=5):
+    index = faiss.read_index(os.path.join(output_dir, "libraries.index"))
+
+    with open(os.path.join(output_dir, "libraries.mapping.json"), encoding="utf-8") as f:
+        mapping = json.load(f)
+
+    vector = embed_texts([query])[0]
+    vector_np = np.array([vector]).astype("float32")
+
+    distances, indices = index.search(vector_np, k)
+
+    results = []
+    for idx in indices[0]:
+        lib_name = mapping[idx]
+        results.append(libs_meta[lib_name])
+
+    return results
+
+
+def retrieve_library_subpages(query, lib_name, libs_meta, output_dir, k=5):
+    index = faiss.read_index(os.path.join(output_dir, f"{lib_name}.subpages.index"))
+
+    with open(os.path.join(output_dir, f"{lib_name}.subpages.mapping.json"), encoding="utf-8") as f:
+        mapping = json.load(f)
+
+    vector = embed_texts([query])[0]
+    vector_np = np.array([vector]).astype("float32")
+
+    distances, indices = index.search(vector_np, k)
+
+    results = []
+    for idx in indices[0]:
+        lib, page = mapping[idx]
+        results.append((page, libs_meta[lib]["subpages"][page]))
+
+    return results
+
+
+def build_all_vector_dbs(basic_meta: dict, libs_meta: dict):
+    vdbs_output_dir = cfg.get("VDBS_OUTPUT_DIR")
+
+    print("Building basic vector DB...")
+    build_basic_vector_db(basic_meta, vdbs_output_dir)
+    print("Building library high-level vector DB...")
+    build_library_highlevel_db(libs_meta, vdbs_output_dir)
+    print("Building per-library subpage vector DBs...")
+    build_library_subpage_dbs(libs_meta, vdbs_output_dir)
+
+    print("Build complete.")
+
+
+if __name__ == "__main__":
+    import logging
+
+    LOG_PATH = "vector_db_test.log"
+
+    logging.basicConfig(
+        filename=LOG_PATH,
+        filemode="w",
+        level=logging.INFO,
+        format="%(asctime)s - %(levelname)s - %(message)s"
+    )
+
+    with open(cfg.get("BASIC_OUTPUT_PATH"), encoding="utf-8") as f:
+        basic_meta = json.load(f)
+
+    with open(cfg.get("LIBRARIES_OUTPUT_PATH"), encoding="utf-8") as f:
+        libs_meta = json.load(f)
+
+    build_all_vector_dbs(basic_meta, libs_meta)
+
+    # ---------------------------------------------------------
+    # TESTS
+    # ---------------------------------------------------------
+    logging.info("=== Starting Vector DB Retrieval Tests ===")
+
+    # -----------------------------
+    # Test 1: Basic Docs Retrieval
+    # -----------------------------
+    query1 = "function"
+    results1 = retrieve_basic(query1, basic_meta, cfg.get("VDBS_OUTPUT_DIR"), k=3)
+
+    logging.info("=== Test 1: Basic Docs Retrieval ===")
+    logging.info(f"Query: {query1}")
+
+    for i, (key, item) in enumerate(results1, 1):
+        logging.info(f"\nResult {i}:")
+        logging.info(f"Key: {key}")
+        logging.info(f"Description: {item.get('description')}")
+
+    # -----------------------------
+    # Test 2: Library High-Level Retrieval
+    # -----------------------------
+    query2 = "How to handle Google authentication in Alusus?"
+    results2 = retrieve_library_highlevel(query2, libs_meta, cfg.get("VDBS_OUTPUT_DIR"), k=3)
+
+    logging.info("\n=== Test 2: Library High-Level Retrieval ===")
+    logging.info(f"Query: {query2}")
+
+    for i, item in enumerate(results2, 1):
+        logging.info(f"\nResult {i}:")
+        logging.info(f"Library: {item.get('name')}")
+        logging.info(f"Description: {item.get('description')}")
+
+    # -----------------------------
+    # Test 3: Subpage Retrieval for a Specific Library
+    # -----------------------------
+    TEST_LIBRARY = "WebPlatform"
+
+    if TEST_LIBRARY in libs_meta and libs_meta[TEST_LIBRARY].get("subpages"):
+        query3 = "How to create a webpage with a picture in Alusus?"
+        results3 = retrieve_library_subpages(query3, TEST_LIBRARY, libs_meta, cfg.get("VDBS_OUTPUT_DIR"), k=3)
+
+        logging.info(f"\n=== Test 3: Subpage Retrieval for '{TEST_LIBRARY}' ===")
+        logging.info(f"Query: {query3}")
+
+        for i, (page, item) in enumerate(results3, 1):
+            logging.info(f"\nResult {i}:")
+            logging.info(f"Subpage: {page}")
+            logging.info(f"Description: {item.get('description')}")
+    else:
+        logging.info(f"\nSkipping Test 3: Library '{TEST_LIBRARY}' has no subpages.")
+
+    logging.info("=== Vector DB Retrieval Tests Completed ===")

From cefaeeb8e32ae9657434cd11512a61e9d9863de0 Mon Sep 17 00:00:00 2001
From: KaisHasan
Date: Wed, 14 Jan 2026 17:03:18 +0300
Subject: [PATCH 7/7] add readme.md file with instructions on how to run the code

---
 data/readme.md | 164 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 data/readme.md

diff --git a/data/readme.md b/data/readme.md
new file mode 100644
index 0000000..d339f79
--- /dev/null
+++ b/data/readme.md
@@ -0,0 +1,164 @@
+# 📘 Documentation Scraper & Vector Database Builder
+
+This folder contains the full pipeline for:
+
+- Scraping documentation from the Alusus website and GitHub repos
+- Extracting metadata for basic docs and libraries
+- Generating descriptions using OpenAI models
+- Extracting notable functions and classes for each library using OpenAI models
+- Creating FAISS vector databases for retrieval
+
+The pipeline consists of three main scripts:
+
+1. `scraper.py`
+2. `generate_description.py`
+3. `create_vdbs.py`
+
+Each script depends on a shared `config.json` file.
+
+---
+
+## 📦 Requirements
+
+Before running the pipeline, ensure that:
+
+- You have installed the full dependencies of the main repository
+- Additionally, **BeautifulSoup (bs4)** must be installed:
+
+```bash
+pip install beautifulsoup4
+```
+
+or
+
+```bash
+conda install conda-forge::beautifulsoup4
+```
+
+This is required for HTML parsing during documentation scraping.
+
+---
+
+## ⚙️ Configuration (`config.json`)
+
+You must create a `config.json` file in this folder before running any script.
+
+Use the following structure:
+
+```json
+{
+  "WEBSITE_DOCS_URLS": [
+    "https://alusus.org/Documents/lang-reference.en.html",
+    "https://alusus.org/Documents/srt-reference.en.html"
+  ],
+  "BASIC_OUTPUT_PATH": "./basic_meta.json",
+  "LIBRARIES_META_URL": "https://alusus.org/Releases/libraries.json",
+  "LIBRARIES_OUTPUT_PATH": "./libs_meta.json",
+  "GITHUB_TOKEN": "...",
+  "OPENAI_API_KEY": "...",
+  "OPENAI_SUMMARISATION_MODEL": "gpt-4o-mini",
+  "SUMMARY_PROMPT_PATH": "./summary_gen_prompt.txt",
+  "OPENAI_FUNC_CLS_EXTRACTION_MODEL": "gpt-5.2-chat-latest",
+  "FUNC_CLS_EXCTRACTION_PROMPT_PATH": "./extract_func_cls_prompt.txt",
+  "EMBEDDING_MODEL": "text-embedding-3-large",
+  "VDBS_OUTPUT_DIR": "./vdbs/"
+}
+```
+
+### 🔑 Required Keys
+
+- **GITHUB_TOKEN**
+  Used to query GitHub’s API for default branch names without hitting rate limits.
+
+- **OPENAI_API_KEY**
+  Required for generating summaries, extracting function/class metadata, and creating embeddings.
+
+---
+
+## 🔐 Creating a GitHub Token
+
+GitHub requires authentication for higher API rate limits.
+To create a token:
+
+1. Visit:
+   `https://github.com/settings/tokens?type=beta`
+2. Click **“Generate new token”**
+3. Choose **Fine-grained token**
+4. Give it a name (e.g., `alusus-docs-scraper`)
+5. Under **Repository access**, choose:
+   - **Public repositories** → Read-only
+6. Generate the token
+7. Copy it and paste it into your `config.json` under `"GITHUB_TOKEN"`
+
+This token allows the scraper to query repository metadata without hitting the unauthenticated rate limit.
+
+---
+
+## ▶️ Running the Pipeline
+
+Before running the scripts, activate the full environment of the main repository.
+
+Then navigate to this folder and run the scripts in order:
+
+### 1. Scrape documentation and library metadata
+
+```bash
+python scraper.py
+```
+
+This generates:
+
+- `basic_meta.json`
+- `libs_meta.json`
+
+---
+
+### 2. Generate descriptions using OpenAI
+
+```bash
+python generate_description.py
+```
+
+This enriches the metadata with:
+
+- Summaries
+- Function/class descriptions
+- Cleaned content
+
+---
+
+### 3. Create vector databases (FAISS)
+
+```bash
+python create_vdbs.py
+```
+
+This produces:
+
+- A vector DB for basic docs
+- A vector DB for library‑level descriptions
+- A vector DB for each library with subpages
+
+All stored under the directory defined in:
+
+```json
+"VDBS_OUTPUT_DIR": "./vdbs/"
+```
+
+---
+
+## 📁 Output Structure
+
+After running all scripts, you will have:
+
+```
+basic_meta.json
+libs_meta.json
+vdbs/
+  basic.index
+  basic.mapping.json
+  libraries.index
+  libraries.mapping.json
+  <library_name>.subpages.index
+  <library_name>.subpages.mapping.json
+```
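+
+---
+
+## 🔎 Quick Retrieval Check (optional)
+
+After the pipeline finishes, you can reuse the retrieval helpers from `create_vdbs.py` to sanity-check the
+generated indexes. This is a minimal sketch; it assumes you run it from this folder with the same
+`config.json` and that the vector DBs above were already built (the sample question is only an example):
+
+```python
+import json
+
+# Reuses the helpers and the already-loaded config from create_vdbs.py in this folder.
+from create_vdbs import cfg, retrieve_basic
+
+# Load the basic-docs metadata produced by scraper.py / generate_description.py.
+with open(cfg.get("BASIC_OUTPUT_PATH"), encoding="utf-8") as f:
+    basic_meta = json.load(f)
+
+# Fetch the three closest basic-doc sections for a sample question and print their summaries.
+for key, item in retrieve_basic("How to define a function?", basic_meta, cfg.get("VDBS_OUTPUT_DIR"), k=3):
+    print(key, "->", item.get("description"))
+```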