diff --git a/environment.yml b/environment.yml index aa9680c..7445045 100644 --- a/environment.yml +++ b/environment.yml @@ -18,6 +18,9 @@ dependencies: - streamlit - panel - pip + - nltk + - ratelimit + - backoff - pip: - jupyter-panel-proxy==0.2.0a2 - qdrant-client diff --git a/github-rag.ipynb b/github-rag.ipynb new file mode 100644 index 0000000..a5fcd94 --- /dev/null +++ b/github-rag.ipynb @@ -0,0 +1,525 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Retrieval Augmented Generation - make your own local chat from GitHub docs\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Intro\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Baking our final results\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "astro_ph_df = pd.read_pickle(\"resources/data/astro-ph-arXiv-abstracts.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/anaconda3/envs/ssec-usrse2024/lib/python3.11/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. 
in jupyter console)\n", + " from tqdm.autonotebook import tqdm, trange\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading existing Qdrant collection 'arxiv_astro-ph_abstracts'\n" + ] + } + ], + "source": [ + "from langchain import PromptTemplate\n", + "from llama_cpp import Llama\n", + "\n", + "from langchain_community.llms import LlamaCpp\n", + "from langchain_core.callbacks import StreamingStdOutCallbackHandler\n", + "\n", + "from ssec_usrse2024 import OLMO_MODEL\n", + "\n", + "from langchain_community.embeddings import HuggingFaceEmbeddings\n", + "from sentence_transformers import SentenceTransformer, util\n", + "\n", + "import os\n", + "\n", + "from langchain_community.vectorstores import Qdrant\n", + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "\n", + "from qdrant_client import QdrantClient\n", + "\n", + "olmo = LlamaCpp(\n", + " model_path=str(OLMO_MODEL),\n", + " temperature=0.8,\n", + " verbose=False, \n", + ")\n", + "\n", + "prompt_template = PromptTemplate.from_template(\n", + " template=olmo.client.metadata['tokenizer.chat_template'], \n", + " template_format=\"jinja2\"\n", + ")\n", + "\n", + "\n", + "\n", + "def ask_question(question):\n", + " print(f\"\\n\\n------------------------------------------\\nQuestion: {question}\")\n", + " messages = [\n", + " {\n", + " \"role\": \"user\", \n", + " \"content\": f\"\"\"You are an astrophysics expert. Please answer the following question on astrophysics. 
\n", + " Question: {question}\"\"\"\n", + " }\n", + " ]\n", + " return llm_chain.invoke(\n", + " {\n", + " \"messages\": messages, \n", + " \"add_generation_prompt\": True, \n", + " \"eos_token\": \"<|endoftext|>\",\n", + " },\n", + " config={\n", + " 'callbacks' : [StreamingStdOutCallbackHandler()]\n", + " }\n", + " )\n", + "\n", + "model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + "\n", + "# TODO: Fix module paths\n", + "qdrant_path = \"resources/data/qdrant/scipy_qdrant/\"\n", + "\n", + "# TODO: Change collection name to \n", + "qdrant_collection = \"arxiv_astro-ph_abstracts\"\n", + "\n", + "if os.path.exists(qdrant_path):\n", + " print(f\"Loading existing Qdrant collection '{qdrant_collection}'\")\n", + " \n", + " client = QdrantClient(path=qdrant_path)\n", + " \n", + " qdrant = Qdrant(\n", + " client=client,\n", + " collection_name=qdrant_collection,\n", + " embeddings=model\n", + " )\n", + "\n", + "retriever = qdrant.as_retriever(search_type=\"mmr\", search_kwargs={\"k\": 2})\n", + "\n", + "def format_docs(docs):\n", + " return \"\\n\\n\".join(doc.page_content for doc in docs)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Dark matter is a theoretical particle that, according to contemporary astrophysical observations, makes up approximately 80% of the total mass in the universe (1). This mysterious substance has no illuminating properties, meaning it does not emit, reflect, or refract light. In other words, dark matter is invisible to our current technology and understanding.\n", + "\n", + "While its properties are still largely unknown, scientists suggest that it may be made up of stars and black holes that have cooled down to the point where they do not emit light anymore (2). 
This idea, called the Warm Dark Matter (WDM) model, aims to provide a more comprehensive picture of dark matter's distribution and properties compared to the Cold Dark Matter (CDM) hypothesis.\n", + "\n", + "Despite numerous theories and research in this area, understanding the nature and origin of dark matter remains one of the greatest unsolved mysteries in cosmology today. This review is aimed at providing an accessible yet rigorous introduction for advanced students and researchers new to the field of dark matter, offering extensive references for further exploration (3).\n", + "\n", + "(1) \"Dark Matter: The Remnant Matter in the Universe\" by B. P. Lee et al., Journal of Cosmology and Astroparticle Physics, Vol. 8, No. 1, 2018;" + ] + }, + { + "data": { + "text/plain": [ + "' Dark matter is a theoretical particle that, according to contemporary astrophysical observations, makes up approximately 80% of the total mass in the universe (1). This mysterious substance has no illuminating properties, meaning it does not emit, reflect, or refract light. In other words, dark matter is invisible to our current technology and understanding.\\n\\nWhile its properties are still largely unknown, scientists suggest that it may be made up of stars and black holes that have cooled down to the point where they do not emit light anymore (2). This idea, called the Warm Dark Matter (WDM) model, aims to provide a more comprehensive picture of dark matter\\'s distribution and properties compared to the Cold Dark Matter (CDM) hypothesis.\\n\\nDespite numerous theories and research in this area, understanding the nature and origin of dark matter remains one of the greatest unsolved mysteries in cosmology today. This review is aimed at providing an accessible yet rigorous introduction for advanced students and researchers new to the field of dark matter, offering extensive references for further exploration (3).\\n\\n(1) \"Dark Matter: The Remnant Matter in the Universe\" by B. P. 
Lee et al., Journal of Cosmology and Astroparticle Physics, Vol. 8, No. 1, 2018;'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "question = \"What is dark matter?\"\n", + "\n", + "context = format_docs(retriever.invoke(question))\n", + "\n", + "prompt_template.format(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\", \n", + " \"content\": f\"\"\"You are an expert at astrophysics. Please answer the question on astrophysics based on the following context:\n", + "\n", + " Context: {context}\n", + " \n", + " Question: {question}\"\"\"\n", + " }\n", + " ], \n", + " add_generation_prompt=True, \n", + " eos_token=\"<|endoftext|>\"\n", + ")\n", + "\n", + "llm_chain = prompt_template | olmo\n", + "\n", + "llm_chain.invoke(\n", + " {\n", + " \"messages\":\n", + " [{\n", + " \"role\": \"user\", \n", + " \"content\": f\"\"\"You are an expert at astrophysics. Please answer the question on astrophysics based on the following context:\n", + " \n", + " Context: {context}\n", + " \n", + " Question: {question}\"\"\"\n", + " }\n", + " ], \n", + " \"add_generation_prompt\": True, \n", + " \"eos_token\": \"<|endoftext|>\",\n", + " },\n", + " config={\n", + " 'callbacks' : [StreamingStdOutCallbackHandler()]\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dark matter is a theoretical particle that exists within the framework of astronomical and cosmological physics, though it has not yet been directly detected through laboratory experiments or other means. 
Dark matter makes up approximately 85% of the matter in the universe, while visible matter (which includes stars, planets, gases, and dust) only accounts for about 5% of the total mass-energy density of the observable universe.\n", + "\n", + "This discrepancy between visible matter and dark matter is known as the \"missing mass\" problem. Dark matter is believed to interact with ordinary matter through gravity only, and it does not emit or absorb light, making its detection extremely challenging. Researchers use various methods, such as studying the motions of galaxies and their orbits around the center of the Milky Way, to infer the existence and properties of dark matter.\n", + "\n", + "Despite extensive searching over the last few decades, no direct evidence for dark matter particles has yet been discovered. However, the observation that dark matter makes up a significant fraction of the universe suggests its existence is a widely accepted concept within astrophysics." + ] + }, + { + "data": { + "text/plain": [ + "'Dark matter is a theoretical particle that exists within the framework of astronomical and cosmological physics, though it has not yet been directly detected through laboratory experiments or other means. Dark matter makes up approximately 85% of the matter in the universe, while visible matter (which includes stars, planets, gases, and dust) only accounts for about 5% of the total mass-energy density of the observable universe.\\n\\nThis discrepancy between visible matter and dark matter is known as the \"missing mass\" problem. Dark matter is believed to interact with ordinary matter through gravity only, and it does not emit or absorb light, making its detection extremely challenging. 
Researchers use various methods, such as studying the motions of galaxies and their orbits around the center of the Milky Way, to infer the existence and properties of dark matter.\\n\\nDespite extensive searching over the last few decades, no direct evidence for dark matter particles has yet been discovered. However, the observation that dark matter makes up a significant fraction of the universe suggests its existence is a widely accepted concept within astrophysics.'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "question = \"What is dark matter?\"\n", + "\n", + "prompt_template.format(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\", \n", + " \"content\": f\"\"\"You are an expert at astrophysics. Please answer the question on astrophysics. \n", + " Question: {question}\"\"\"\n", + " }\n", + " ], \n", + " add_generation_prompt=True, \n", + " eos_token=\"<|endoftext|>\"\n", + ")\n", + "\n", + "llm_chain.invoke(\n", + " {\n", + " \"messages\":\n", + " [\n", + " {\n", + " \"role\": \"user\", \n", + " \"content\": f\"\"\"You are an expert at astrophysics. Please answer the question on astrophysics. 
\n", + " Question: {question}\"\"\"\n", + " }\n", + " ], \n", + " \"add_generation_prompt\": True, \n", + " \"eos_token\": \"<|endoftext|>\",\n", + " },\n", + " config={\n", + " 'callbacks' : [StreamingStdOutCallbackHandler()]\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Fix module paths\n", + "qdrant_path2 = \"resources/data/qdrant/usrse_qdrant/\"\n", + "\n", + "# TODO: Change collection name to \n", + "qdrant_collection2 = \"astropy_docs\"\n", + "\n", + "if os.path.exists(qdrant_path):\n", + " print(f\"Loading existing Qdrant collection '{qdrant_collection2}'\")\n", + " \n", + " client2 = QdrantClient(path=qdrant_path2)\n", + " \n", + " qdrant2 = Qdrant(\n", + " client=client2,\n", + " collection_name=qdrant_collection2,\n", + " embeddings=model\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "retriever = qdrant2.as_retriever(search_type=\"mmr\", search_kwargs={\"k\": 2})\n", + "\n", + "def format_docs(docs):\n", + " return \"\\n\\n\".join(doc.page_content for doc in docs[0])\n", + " #return \"\\n\\n\".join(doc.page_content for doc in docs)\n", + "\n", + "\n", + "\n", + "question = \"How can I perform celestial coordinate transformations?\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docs = retriever.invoke(question)\n", + "\n", + "context = format_docs(retriever.invoke(question))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(context.split()) *3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "context = retriever.invoke(question)[0]\n", + 
"\n", + "print(len(prompt_template.format(\n", + " messages=[\n", + " {\n", + " \"role\": \"user\", \n", + " \"content\": f\"\"\"You are an expert at the astrophysics package Astropy. Please answer the question on Astropy based on the following context:\n", + "\n", + " Context: {context}\n", + " \n", + " Question: {question}\"\"\"\n", + " }\n", + " ], \n", + " add_generation_prompt=True, \n", + " eos_token=\"<|endoftext|>\"\n", + ").split()))\n", + "\n", + "print(prompt_template.template)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "llm_chain = prompt_template | olmo\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(llm_chain)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# llm_chain.invoke(\n", + "# {\n", + "# \"messages\":\n", + "# [\n", + "# ], \n", + "# \"add_generation_prompt\": True, \n", + "# \"eos_token\": \"<|endoftext|>\",\n", + "# },\n", + "# config={\n", + "# 'callbacks' : [StreamingStdOutCallbackHandler()]\n", + "# }\n", + "# )\n", + "\n", + "from langchain.retrievers import ContextualCompressionRetriever\n", + "from langchain.retrievers.document_compressors import LLMChainExtractor\n", + "\n", + "\n", + "compressor = LLMChainExtractor.from_llm(olmo)\n", + "\n", + "compression_retriever = ContextualCompressionRetriever(\n", + " base_compressor=compressor, base_retriever=retriever\n", + ")\n", + "\n", + "compressed_docs = compression_retriever.invoke(\"How can I perform celestial coordinate transformations?\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "\n", + "llm_chain.invoke(\n", + " {\n", + " \"messages\":\n", + " [{\n", + " \"role\": \"user\", \n", + " \"content\": f\"\"\"You are an expert at the astrophysics package Astropy. 
Please answer the question on Astropy based on the following context:\n", + " \n", + " Context: {context}\n", + " \n", + " Question: {question}\"\"\"\n", + " }\n", + " ], \n", + " \"add_generation_prompt\": True, \n", + " \"eos_token\": \"<|endoftext|>\",\n", + " },\n", + " config={\n", + " 'callbacks' : [StreamingStdOutCallbackHandler()]\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Retrieving GitHub docs\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Vectorization and Embedding\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Retrieval\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Prompting\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Final Results\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## To see more, visit SSEC Tutorials! 
:D <3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ssec-usrse2024", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/qdrant-vector-database-creation.ipynb b/qdrant-vector-database-creation.ipynb new file mode 100644 index 0000000..60069e2 --- /dev/null +++ b/qdrant-vector-database-creation.ipynb @@ -0,0 +1,400 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ce91ba40-0f53-4fc0-b687-a385a4c36a10", + "metadata": {}, + "source": [ + "# Qdrant Vector Database Creation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26843ee7-eb3f-48df-bee7-125b994ce25e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "\n", + "from getpass import getpass\n", + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "from langchain.schema import Document\n", + "from langchain_community.vectorstores import Qdrant\n", + "from llama_cpp import Llama\n", + "#import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from tqdm.notebook import tqdm\n", + "from qdrant_client import QdrantClient" + ] + }, + { + "cell_type": "markdown", + "id": "6446cc91-5e3a-429f-9ead-a38bbf1cf230", + "metadata": {}, + "source": [ + "## GitHub Documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cee387e-ad9a-49a8-9ced-7ae8f9b8d9a0", + "metadata": {}, + "outputs": [], + "source": [ + "# Enter your GitHub Personal Access Token securely\n", + "ACCESS_TOKEN=getpass(prompt=\"GitHub Personal Access Token: \")" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": 
"5f55fb43-c229-4acc-936a-87f20d20d8d1", + "metadata": {}, + "outputs": [], + "source": [ + "#@limits(calls=, period=60)\n", + "def fetch_file(file_url):\n", + " response = requests.get(file_url, headers={'Accept': 'application/vnd.github.v3.raw', 'Authorization': f'token {ACCESS_TOKEN}'})\n", + " response.raise_for_status()\n", + " return response\n", + "\n", + "#@limits(calls=100, period=60)\n", + "def fetch_folder(base_url, headers):\n", + " response = requests.get(base_url, headers=headers)\n", + " response.raise_for_status() # This will raise an error for failed requests\n", + " return response\n", + "\n", + "\n", + "def fetch_and_process_rst_files(repo, branch, path):\n", + " \"\"\"\n", + " Recursively fetch and process RST files from a GitHub repository.\n", + " \"\"\"\n", + " base_url = f\"https://api.github.com/repos/{repo}/contents/{path}?ref={branch}\"\n", + " headers = {'Accept': 'application/vnd.github.v3+json', 'Authorization': f'token {ACCESS_TOKEN}'}\n", + " response = fetch_folder(base_url, headers)\n", + " files = response.json()\n", + "\n", + " documents = []\n", + " for file in files:\n", + " if file['type'] == 'dir': # This is a directory; recurse into it\n", + " documents.extend(fetch_and_process_rst_files(repo, branch, file['path']))\n", + " elif file['name'].endswith('.rst'):\n", + " file_url = file['download_url']\n", + " response = fetch_file(file_url)\n", + " title = file['name'].replace('.rst', '').replace('_', ' ').title()\n", + " documents.append(Document(page_content=response.text, metadata={\"title\": title, \"url\": file_url}))\n", + "\n", + " return documents\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f28805cc-c3c5-4ad4-969e-190cc55b0bcb", + "metadata": {}, + "outputs": [], + "source": [ + "# Usage example\n", + "repository = 'boto/boto3'\n", + "branch = 'develop'\n", + "docs_path = 'docs'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"e848a168-b3a9-48c8-a6ee-34940a18db2f", + "metadata": {}, + "outputs": [], + "source": [ + "github_documents = fetch_and_process_rst_files(\n", + " repo=repository, \n", + " branch=branch, \n", + " path=docs_path,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef991fd7-fa91-4fb3-b189-cc3d521f6832", + "metadata": {}, + "outputs": [], + "source": [ + "len(github_documents)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9500fc61", + "metadata": {}, + "outputs": [], + "source": [ + "github_documents[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e744a104", + "metadata": {}, + "outputs": [], + "source": [ + "gh_s = pd.Series(github_documents)\n", + "gh_s.to_pickle('resources/data/boto3_docs.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6adbd0d", + "metadata": {}, + "outputs": [], + "source": [ + "gh_df.to_list()" + ] + }, + { + "cell_type": "markdown", + "id": "a5730eb8-7738-4099-be24-a5d2c1f4fd90", + "metadata": {}, + "source": [ + "### Documents Loader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a662354", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "44d819f9-df1f-4fb1-9b82-5c438a274873", + "metadata": {}, + "source": [ + "LangChain helps load different documents (.txt, .pdf, .docx, .csv, .xlsx, .json) to feed into the LLM. The Document Loader even allows YouTube audio parsing and loading as part of unstructured document loading.\n", + "\n", + "Once loaded into the LangChain, the document can be pre-processed in different ways as required in the LLM application. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e403f62-fc91-4be9-9d45-292190c221fd", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import DataFrameLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd8cd94b-474e-49a5-956e-27f89e65fca4", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the dataframe full of abstracts\n", + "# to memory in the form of LangChain Document objects\n", + "loader = DataFrameLoader(astro_df, page_content_column=\"abstract\") \n", + "astrophysics_abstracts_documents = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8de58aaa-4cae-45b7-bf55-dc00739c94a8", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Number of astrophysics papers: \", len(astrophysics_abstracts_documents))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "138995f7-5bba-43b9-ba7a-ab8ef9b16c21", + "metadata": {}, + "outputs": [], + "source": [ + "all_documents = github_documents\n", + "print(\"Total Number of Documents: \", len(all_documents))" + ] + }, + { + "cell_type": "markdown", + "id": "16d1ebe7-8e12-4e2f-97d2-55074673aa37", + "metadata": {}, + "source": [ + "## Qdrant Creation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdfe9f2e", + "metadata": {}, + "outputs": [], + "source": [ + "gh_df = pd.read_pickle('resources/data/boto3_docs.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "901dcab4-2a76-48d3-94b8-01b571311d79", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: Fix the path\n", + "qdrant_path = \"resources/data/qdrant/usrse_qdrant/\"\n", + "qdrant_collection = \"boto3_docs\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95f2d75f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be5c185b-c033-4058-989e-2b7c48138195", 
+ "metadata": {}, + "outputs": [], + "source": [ + "model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-MiniLM-L6-v2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8791db7", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_text_splitters import MarkdownTextSplitter\n", + "\n", + "#text_splitter = MarkdownTextSplitter(chunk_size=512, chunk_overlap=0)\n", + "text_splitter = MarkdownTextSplitter()\n", + "texts = text_splitter.split_documents(gh_df)\n", + "\n", + "print(\"Number of text chunks: \", len(texts))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6202ff82-c219-4c17-b960-c6d34231db08", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Creating new Qdrant collection '{qdrant_collection}' from {len(texts)} documents\")\n", + " \n", + "# Load the documents into a Qdrant Vector Database Collection\n", + "# this will save locally in the current directory as sqlite\n", + "qdrant = Qdrant.from_documents(\n", + " documents=texts,\n", + " embedding=model,\n", + " path=qdrant_path,\n", + " collection_name=qdrant_collection,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "578b5a20-a2e9-4006-a514-9a544f4e502e", + "metadata": {}, + "outputs": [], + "source": [ + "# Setup the retriever for later step\n", + "retriever = qdrant.as_retriever(search_type=\"mmr\", search_kwargs={\"k\": 2})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "323b976a-c7cc-4d86-9050-311b926d22c7", + "metadata": {}, + "outputs": [], + "source": [ + "retriever.invoke(\"How can I create an SQS queue?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e668bc7c-2eba-4329-aa4b-5c9866885f55", + "metadata": {}, + "outputs": [], + "source": [ + "retriever.invoke(\"How can I perform celestial coordinate transformations?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"41f53e82-eee2-452a-932f-a5081c7fbe3c", + "metadata": {}, + "outputs": [], + "source": [ + "# Post-processing\n", + "def format_docs(docs):\n", + " return \"\\n\\n\".join(doc.page_content for doc in docs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b189a1f7-a7b0-456f-ac34-794fec521617", + "metadata": {}, + "outputs": [], + "source": [ + "print(format_docs(retriever.invoke(\"How do I create a DynamoDB table?\")))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2403e158-85b8-4a5a-aa0f-8428fb133d9d", + "metadata": {}, + "outputs": [], + "source": [ + "qdrant = None" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/resources/data/astropy_docs.pkl b/resources/data/astropy_docs.pkl new file mode 100644 index 0000000..0c94bc5 Binary files /dev/null and b/resources/data/astropy_docs.pkl differ