diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000..0165352 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.10 + + +# Add non-root user +ARG USERNAME=nonroot +RUN groupadd --gid 1000 $USERNAME && \ + useradd --uid 1000 --gid 1000 -m $USERNAME +## Make sure to reflect new user in PATH +ENV PATH="/home/${USERNAME}/.local/bin:${PATH}" +USER $USERNAME + +# Upgrade pip +RUN pip install --upgrade pip + +# Install production and dev dependencies +COPY --chown=nonroot:1000 requirements.txt /tmp/requirements.txt +RUN pip install --no-cache-dir -r /tmp/requirements.txt && \ + rm /tmp/requirements.txt diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..ea8df0f --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,29 @@ +{ + "build": { + "dockerfile": "Dockerfile", + "context": ".." + }, + "remoteUser": "nonroot", + "portsAttributes": { + "5005": { + "label": "flask", + "onAutoForward": "openBrowser" + } + }, + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "ms-azuretools.vscode-docker", + "github.copilot" + ], + "settings": { + "terminal.integrated.defaultProfile.linux": "bash" + } + } + }, + + "forwardPorts": [ + 5005 + ] +} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 2eea525..c45936d 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -.env \ No newline at end of file +.env +repos \ No newline at end of file diff --git a/Github.code-workspace b/Github.code-workspace deleted file mode 100644 index 362d7c2..0000000 --- a/Github.code-workspace +++ /dev/null @@ -1,7 +0,0 @@ -{ - "folders": [ - { - "path": "." 
- } - ] -} \ No newline at end of file diff --git a/app.py b/app.py index b526022..a8cebe6 100644 --- a/app.py +++ b/app.py @@ -1,6 +1,5 @@ from flask import Flask, render_template, request, jsonify import os -import getpass from langchain.embeddings.openai import OpenAIEmbeddings from langchain.vectorstores import DeepLake from langchain.chat_models import ChatOpenAI @@ -16,7 +15,7 @@ ACTIVELOOP_TOKEN = os.getenv('ACTIVELOOP_TOKEN') embeddings = OpenAIEmbeddings(disallowed_special=()) -db = DeepLake(dataset_path="hub://davitbun/twitter-algorithm", read_only=True, embedding_function=embeddings) +db = DeepLake(dataset_path="hub://theodoremeynard/ddataflow", read_only=True, embedding_function=embeddings) retriever = db.as_retriever() retriever.search_kwargs['distance_metric'] = 'cos' retriever.search_kwargs['fetch_k'] = 100 @@ -50,4 +49,4 @@ def ask_question(): return jsonify({"question": question, "answer": result['answer']}) if __name__ == "__main__": - app.run(debug=True) + app.run(debug=True,port=5005) diff --git a/readme.md b/readme.md index 3f2eb8a..ce9ebfe 100644 --- a/readme.md +++ b/readme.md @@ -1,6 +1,6 @@ -# Twitter Algorithm Chatbot +# Algorithm Chatbot -This is a simple chatbot that can answer questions about the [Twitter algorithm](https://github.com/twitter/the-algorithm). It is built using Python, HTML, CSS, and JavaScript. +This is a simple chatbot that can answer questions about an algorithm like [twitter algorithm](https://github.com/twitter/the-algorithm). It is built using Python, HTML, CSS, and JavaScript. Please note that since we are using GPT-4, the response times will be higher and every query will cost more than GPT-3.5. @@ -25,7 +25,22 @@ cd GitGPT python app.py ``` -The chatbot interface will appear, allowing you to ask questions about the Twitter algorithm. 
+You can also directly use a devcontainer in vscode by clicking on the icon below + +[ + ![Open in Remote - Containers]( + https://img.shields.io/static/v1?label=Remote%20-%20Containers&message=Open&color=blue&logo=visualstudiocode + ) +]( + https://vscode.dev/redirect?url=vscode://ms-vscode-remote.remote-containers/cloneInVolume?url=https://github.com/theodoremeynard/GitGPT +) + +and then you just need to run +```bash +python app.py +``` + +The chatbot interface will appear, allowing you to ask questions about the algorithm. Enter your question in the input field and click the "Send" button or press the "Enter" key to submit your query. The chatbot will display a "Thinking..." message while it processes your request, and then it will display the answer to your question. diff --git a/requirements.txt b/requirements.txt index 51ecb68..26ed8d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,6 @@ Flask==2.0.1 -langchain==0.0.12 -dotenv==0.17.1 \ No newline at end of file +langchain==0.0.170 +python-dotenv==1.0.0 +openai==0.27.6 +deeplake==3.4.3 +tiktoken==0.4.0 \ No newline at end of file diff --git a/templates/index.html b/templates/index.html index 32c6013..106f892 100644 --- a/templates/index.html +++ b/templates/index.html @@ -11,7 +11,7 @@
- +
"""Clone a git repository and upload embeddings of its files to DeepLake.

Run as a script, this module clones the DDataFlow repository, splits its
``.py`` and ``.md`` files into chunks and pushes them to a DeepLake dataset.
"""
import logging
import os
import subprocess

from dotenv import load_dotenv
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import DeepLake

load_dotenv()

logger = logging.getLogger(__name__)

# File name suffixes that prepare_data picks up while walking a directory.
_INDEXED_SUFFIXES = (".py", ".md")


def clone_repo(repo_url, location=""):
    """
    Clone a git repository into a specified location.

    A failed clone (non-zero exit status from git) is logged as a warning
    instead of raising, so the caller can carry on, e.g. with a checkout that
    is already on disk from a previous run.

    :param repo_url: The URL of the repository to clone.
    :param location: The directory to clone the repository into. When empty,
        no directory argument is passed and git picks the target directory
        itself.
    """
    command = ["git", "clone", repo_url]
    # Passing an empty string as the directory argument is not the same as
    # omitting it, so only add the argument when a location was given.
    if location:
        command.append(location)
    completed = subprocess.run(command)
    if completed.returncode != 0:
        logger.warning(
            "git clone of %s exited with status %d",
            repo_url,
            completed.returncode,
        )


def prepare_data(root_dir):
    """
    Prepare data from a root directory.

    Walks ``root_dir`` recursively, loads every ``.py`` and ``.md`` file as
    UTF-8 text and splits the loaded documents into chunks. Files that cannot
    be loaded are skipped with a warning. The number of resulting chunks is
    printed.

    :param root_dir: The directory to walk.
    :return: The list of chunks returned by the text splitter.
    """
    docs = []
    for dirpath, _, filenames in os.walk(root_dir):
        for file in filenames:
            if not file.endswith(_INDEXED_SUFFIXES):
                continue
            path = os.path.join(dirpath, file)
            try:
                loader = TextLoader(path, encoding="utf-8")
                docs.extend(loader.load_and_split())
            except Exception as exc:
                # Best effort: one unreadable file must not abort the whole
                # run, but the skip should be visible rather than silent.
                logger.warning("Skipping %s: %s", path, exc)
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(docs)
    print(f"{len(texts)}")
    return texts


def push_data_to_deeplake(texts, dataset_path):
    """
    Push data to deeplake.

    :param texts: The documents to store, as returned by ``prepare_data``.
    :param dataset_path: The DeepLake dataset path to write to.
    :return: The object returned by ``DeepLake.from_documents``.
    """
    # Same embeddings configuration app.py uses when it queries the dataset;
    # disallowed_special=() keeps source text that happens to contain
    # special-token strings from being rejected during tokenization.
    embeddings = OpenAIEmbeddings(disallowed_special=())

    db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path)
    return db


# example usage
if __name__ == "__main__":
    clone_repo("https://github.com/getyourguide/DDataFlow.git", "./repos/DDataFlow")
    texts = prepare_data("./repos/DDataFlow")
    db = push_data_to_deeplake(texts, "hub://theodoremeynard/ddataflow")