feat: Enable adding files (#1864)
Co-authored-by: Matt Zhou <[email protected]>
mattzh72 and Matt Zhou authored Oct 14, 2024
1 parent cc616ef commit 9b34769
Showing 26 changed files with 570 additions and 228 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -2,6 +2,9 @@
 # Created by https://www.toptal.com/developers/gitignore/api/vim,linux,macos,pydev,python,eclipse,pycharm,windows,netbeans,pycharm+all,pycharm+iml,visualstudio,jupyternotebooks,visualstudiocode,xcode,xcodeinjection
 # Edit at https://www.toptal.com/developers/gitignore?templates=vim,linux,macos,pydev,python,eclipse,pycharm,windows,netbeans,pycharm+all,pycharm+iml,visualstudio,jupyternotebooks,visualstudiocode,xcode,xcodeinjection
 
+openapi_letta.json
+openapi_openai.json
+
 ### Eclipse ###
 .metadata
 bin/
2 changes: 1 addition & 1 deletion docs/generate_docs.py
@@ -70,7 +70,7 @@ def generate_modules(config):
         "Message",
         "Passage",
         "AgentState",
-        "Document",
+        "File",
         "Source",
         "LLMConfig",
         "EmbeddingConfig",
2 changes: 1 addition & 1 deletion examples/notebooks/data_connector.ipynb
@@ -270,7 +270,7 @@
 "outputs": [],
 "source": [
 "from letta.data_sources.connectors import DataConnector \n",
-"from letta.schemas.document import Document\n",
+"from letta.schemas.file import FileMetadata\n",
 "from llama_index.core import Document as LlamaIndexDocument\n",
 "from llama_index.core import SummaryIndex\n",
 "from llama_index.readers.web import SimpleWebPageReader\n",
2 changes: 1 addition & 1 deletion letta/__init__.py
@@ -7,9 +7,9 @@
 # imports for easier access
 from letta.schemas.agent import AgentState
 from letta.schemas.block import Block
-from letta.schemas.document import Document
 from letta.schemas.embedding_config import EmbeddingConfig
 from letta.schemas.enums import JobStatus
+from letta.schemas.file import FileMetadata
 from letta.schemas.job import Job
 from letta.schemas.letta_message import LettaMessage
 from letta.schemas.llm_config import LLMConfig
25 changes: 18 additions & 7 deletions letta/agent_store/db.py
@@ -28,7 +28,7 @@
 from letta.base import Base
 from letta.config import LettaConfig
 from letta.constants import MAX_EMBEDDING_DIM
-from letta.metadata import EmbeddingConfigColumn, ToolCallColumn
+from letta.metadata import EmbeddingConfigColumn, FileMetadataModel, ToolCallColumn
 
 # from letta.schemas.message import Message, Passage, Record, RecordType, ToolCall
 from letta.schemas.message import Message
@@ -141,7 +141,7 @@ class PassageModel(Base):
     id = Column(String, primary_key=True)
     user_id = Column(String, nullable=False)
     text = Column(String)
-    doc_id = Column(String)
+    file_id = Column(String)
     agent_id = Column(String)
     source_id = Column(String)
 
@@ -160,7 +160,7 @@ class PassageModel(Base):
     # Add a datetime column, with default value as the current time
     created_at = Column(DateTime(timezone=True))
 
-    Index("passage_idx_user", user_id, agent_id, doc_id),
+    Index("passage_idx_user", user_id, agent_id, file_id),
 
     def __repr__(self):
         return f"<Passage(passage_id='{self.id}', text='{self.text}', embedding='{self.embedding})>"
@@ -170,7 +170,7 @@ def to_record(self):
             text=self.text,
             embedding=self.embedding,
             embedding_config=self.embedding_config,
-            doc_id=self.doc_id,
+            file_id=self.file_id,
             user_id=self.user_id,
             id=self.id,
             source_id=self.source_id,
@@ -365,12 +365,17 @@ def __init__(self, table_type: str, config: LettaConfig, user_id, agent_id=None)
             self.uri = self.config.archival_storage_uri
             self.db_model = PassageModel
             if self.config.archival_storage_uri is None:
-                raise ValueError(f"Must specifiy archival_storage_uri in config {self.config.config_path}")
+                raise ValueError(f"Must specify archival_storage_uri in config {self.config.config_path}")
         elif table_type == TableType.RECALL_MEMORY:
             self.uri = self.config.recall_storage_uri
             self.db_model = MessageModel
             if self.config.recall_storage_uri is None:
-                raise ValueError(f"Must specifiy recall_storage_uri in config {self.config.config_path}")
+                raise ValueError(f"Must specify recall_storage_uri in config {self.config.config_path}")
+        elif table_type == TableType.FILES:
+            self.uri = self.config.metadata_storage_uri
+            self.db_model = FileMetadataModel
+            if self.config.metadata_storage_uri is None:
+                raise ValueError(f"Must specify metadata_storage_uri in config {self.config.config_path}")
         else:
             raise ValueError(f"Table type {table_type} not implemented")
 
@@ -487,8 +492,14 @@ def __init__(self, table_type: str, config: LettaConfig, user_id, agent_id=None)
             # TODO: eventually implement URI option
             self.path = self.config.recall_storage_path
             if self.path is None:
-                raise ValueError(f"Must specifiy recall_storage_path in config {self.config.recall_storage_path}")
+                raise ValueError(f"Must specify recall_storage_path in config.")
             self.db_model = MessageModel
+        elif table_type == TableType.FILES:
+            self.path = self.config.metadata_storage_path
+            if self.path is None:
+                raise ValueError(f"Must specify metadata_storage_path in config.")
+            self.db_model = FileMetadataModel
+
         else:
             raise ValueError(f"Table type {table_type} not implemented")
 
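Configuration note (editorial sketch, not part of the commit): the new FILES branches read metadata_storage_uri for the Postgres-backed connector and metadata_storage_path for the SQLite-backed connector, mirroring the existing archival/recall settings. A minimal sketch of the fields involved; the LettaConfig.load() loader and the values are assumptions, only the metadata_storage_* field names come from this diff:

    # Editorial sketch: configuring storage for the new FILES table type.
    from letta.config import LettaConfig

    config = LettaConfig.load()                # assumed standard config loader
    config.metadata_storage_type = "sqlite"    # selects the SQLite-backed connector (placeholder value)
    config.metadata_storage_path = "~/.letta"  # required by the SQLite branch, else it raises
                                               # "Must specify metadata_storage_path in config."
    # A Postgres deployment would set metadata_storage_uri instead; if unset,
    # the connector raises "Must specify metadata_storage_uri in config ...".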
4 changes: 2 additions & 2 deletions letta/agent_store/lancedb.py
@@ -24,7 +24,7 @@ class PassageModel(LanceModel):
     id: uuid.UUID
     user_id: str
     text: str
-    doc_id: str
+    file_id: str
     agent_id: str
     data_source: str
     embedding: Vector(config.default_embedding_config.embedding_dim)
@@ -37,7 +37,7 @@ def to_record(self):
         return Passage(
             text=self.text,
             embedding=self.embedding,
-            doc_id=self.doc_id,
+            file_id=self.file_id,
             user_id=self.user_id,
             id=self.id,
             data_source=self.data_source,
2 changes: 1 addition & 1 deletion letta/agent_store/milvus.py
@@ -26,7 +26,7 @@ def __init__(self, table_type: str, config: LettaConfig, user_id, agent_id=None)
             raise ValueError("Please set `archival_storage_uri` in the config file when using Milvus.")
 
         # need to be converted to strings
-        self.uuid_fields = ["id", "user_id", "agent_id", "source_id", "doc_id"]
+        self.uuid_fields = ["id", "user_id", "agent_id", "source_id", "file_id"]
 
     def _create_collection(self):
         schema = MilvusClient.create_schema(
2 changes: 1 addition & 1 deletion letta/agent_store/qdrant.py
@@ -38,7 +38,7 @@ def __init__(self, table_type: str, config: LettaConfig, user_id, agent_id=None)
                 distance=models.Distance.COSINE,
             ),
         )
-        self.uuid_fields = ["id", "user_id", "agent_id", "source_id", "doc_id"]
+        self.uuid_fields = ["id", "user_id", "agent_id", "source_id", "file_id"]
 
     def get_all_paginated(self, filters: Optional[Dict] = {}, page_size: int = 10) -> Iterator[List[RecordType]]:
         from qdrant_client import grpc
22 changes: 12 additions & 10 deletions letta/agent_store/storage.py
@@ -10,7 +10,7 @@
 from pydantic import BaseModel
 
 from letta.config import LettaConfig
-from letta.schemas.document import Document
+from letta.schemas.file import FileMetadata
 from letta.schemas.message import Message
 from letta.schemas.passage import Passage
 from letta.utils import printd
@@ -22,7 +22,7 @@ class TableType:
     ARCHIVAL_MEMORY = "archival_memory" # recall memory table: letta_agent_{agent_id}
     RECALL_MEMORY = "recall_memory" # archival memory table: letta_agent_recall_{agent_id}
     PASSAGES = "passages" # TODO
-    DOCUMENTS = "documents" # TODO
+    FILES = "files"
 
 
 # table names used by Letta
@@ -33,17 +33,17 @@ class TableType:
 
 # external data source tables
 PASSAGE_TABLE_NAME = "letta_passages" # chunked/embedded passages (from source)
-DOCUMENT_TABLE_NAME = "letta_documents" # original documents (from source)
+FILE_TABLE_NAME = "letta_files" # original files (from source)
 
 
 class StorageConnector:
-    """Defines a DB connection that is user-specific to access data: Documents, Passages, Archival/Recall Memory"""
+    """Defines a DB connection that is user-specific to access data: files, Passages, Archival/Recall Memory"""
 
     type: Type[BaseModel]
 
     def __init__(
         self,
-        table_type: Union[TableType.ARCHIVAL_MEMORY, TableType.RECALL_MEMORY, TableType.PASSAGES, TableType.DOCUMENTS],
+        table_type: Union[TableType.ARCHIVAL_MEMORY, TableType.RECALL_MEMORY, TableType.PASSAGES, TableType.FILES],
         config: LettaConfig,
         user_id,
         agent_id=None,
@@ -59,9 +59,9 @@ def __init__(
         elif table_type == TableType.RECALL_MEMORY:
             self.type = Message
             self.table_name = RECALL_TABLE_NAME
-        elif table_type == TableType.DOCUMENTS:
-            self.type = Document
-            self.table_name == DOCUMENT_TABLE_NAME
+        elif table_type == TableType.FILES:
+            self.type = FileMetadata
+            self.table_name = FILE_TABLE_NAME
         elif table_type == TableType.PASSAGES:
             self.type = Passage
             self.table_name = PASSAGE_TABLE_NAME
@@ -74,7 +74,7 @@ def __init__(
             # agent-specific table
             assert agent_id is not None, "Agent ID must be provided for agent-specific tables"
             self.filters = {"user_id": self.user_id, "agent_id": self.agent_id}
-        elif self.table_type == TableType.PASSAGES or self.table_type == TableType.DOCUMENTS:
+        elif self.table_type == TableType.PASSAGES or self.table_type == TableType.FILES:
             # setup base filters for user-specific tables
             assert agent_id is None, "Agent ID must not be provided for user-specific tables"
             self.filters = {"user_id": self.user_id}
@@ -83,7 +83,7 @@
 
     @staticmethod
     def get_storage_connector(
-        table_type: Union[TableType.ARCHIVAL_MEMORY, TableType.RECALL_MEMORY, TableType.PASSAGES, TableType.DOCUMENTS],
+        table_type: Union[TableType.ARCHIVAL_MEMORY, TableType.RECALL_MEMORY, TableType.PASSAGES, TableType.FILES],
         config: LettaConfig,
         user_id,
         agent_id=None,
@@ -92,6 +92,8 @@ def get_storage_connector(
             storage_type = config.archival_storage_type
         elif table_type == TableType.RECALL_MEMORY:
             storage_type = config.recall_storage_type
+        elif table_type == TableType.FILES:
+            storage_type = config.metadata_storage_type
         else:
             raise ValueError(f"Table type {table_type} not implemented")
 
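Usage sketch (editorial addition, not part of the diff): with TableType.FILES wired into get_storage_connector, a files connector is requested the same way as the passage connector, and it is user-scoped, so no agent_id is passed. The LettaConfig.load() call and the user ID below are assumptions; the dispatch on config.metadata_storage_type comes from this diff:

    # Editorial sketch: obtaining the new files storage connector.
    from letta.agent_store.storage import StorageConnector, TableType
    from letta.config import LettaConfig

    config = LettaConfig.load()  # assumed standard config loader
    files_conn = StorageConnector.get_storage_connector(
        table_type=TableType.FILES,  # dispatches on config.metadata_storage_type
        config=config,
        user_id="user-123",          # placeholder; files tables are user-specific, agent_id must be None
    )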
2 changes: 1 addition & 1 deletion letta/cli/cli_load.py
@@ -106,7 +106,7 @@ def load_vector_database(
 # document_store=None,
 # passage_store=passage_storage,
 # )
-# print(f"Loaded {num_passages} passages and {num_documents} documents from {name}")
+# print(f"Loaded {num_passages} passages and {num_documents} files from {name}")
 # except Exception as e:
 # typer.secho(f"Failed to load data from provided information.\n{e}", fg=typer.colors.RED)
 # ms.delete_source(source_id=source.id)
# ms.delete_source(source_id=source.id)
Expand Down
51 changes: 51 additions & 0 deletions letta/client/client.py
@@ -25,6 +25,7 @@
 
 # new schemas
 from letta.schemas.enums import JobStatus, MessageRole
+from letta.schemas.file import FileMetadata
 from letta.schemas.job import Job
 from letta.schemas.letta_request import LettaRequest
 from letta.schemas.letta_response import LettaResponse, LettaStreamingResponse
@@ -232,6 +233,9 @@ def list_sources(self) -> List[Source]:
     def list_attached_sources(self, agent_id: str) -> List[Source]:
         raise NotImplementedError
 
+    def list_files_from_source(self, source_id: str, limit: int = 1000, cursor: Optional[str] = None) -> List[FileMetadata]:
+        raise NotImplementedError
+
     def update_source(self, source_id: str, name: Optional[str] = None) -> Source:
         raise NotImplementedError
@@ -1016,6 +1020,12 @@ def get_job(self, job_id: str) -> Job:
             raise ValueError(f"Failed to get job: {response.text}")
         return Job(**response.json())
 
+    def delete_job(self, job_id: str) -> Job:
+        response = requests.delete(f"{self.base_url}/{self.api_prefix}/jobs/{job_id}", headers=self.headers)
+        if response.status_code != 200:
+            raise ValueError(f"Failed to delete job: {response.text}")
+        return Job(**response.json())
+
     def list_jobs(self):
         response = requests.get(f"{self.base_url}/{self.api_prefix}/jobs", headers=self.headers)
         return [Job(**job) for job in response.json()]
@@ -1088,6 +1098,30 @@ def list_attached_sources(self, agent_id: str) -> List[Source]:
             raise ValueError(f"Failed to list attached sources: {response.text}")
         return [Source(**source) for source in response.json()]
 
+    def list_files_from_source(self, source_id: str, limit: int = 1000, cursor: Optional[str] = None) -> List[FileMetadata]:
+        """
+        List files from source with pagination support.
+
+        Args:
+            source_id (str): ID of the source
+            limit (int): Number of files to return
+            cursor (Optional[str]): Pagination cursor for fetching the next page
+
+        Returns:
+            List[FileMetadata]: List of files
+        """
+        # Prepare query parameters for pagination
+        params = {"limit": limit, "cursor": cursor}
+
+        # Make the request to the FastAPI endpoint
+        response = requests.get(f"{self.base_url}/{self.api_prefix}/sources/{source_id}/files", headers=self.headers, params=params)
+
+        if response.status_code != 200:
+            raise ValueError(f"Failed to list files with source id {source_id}: [{response.status_code}] {response.text}")
+
+        # Parse the JSON response
+        return [FileMetadata(**metadata) for metadata in response.json()]
+
     def update_source(self, source_id: str, name: Optional[str] = None) -> Source:
         """
         Update a source
@@ -2162,6 +2196,9 @@ def load_file_into_source(self, filename: str, source_id: str, blocking=True):
     def get_job(self, job_id: str):
         return self.server.get_job(job_id=job_id)
 
+    def delete_job(self, job_id: str):
+        return self.server.delete_job(job_id)
+
     def list_jobs(self):
         return self.server.list_jobs(user_id=self.user_id)
@@ -2261,6 +2298,20 @@ def list_attached_sources(self, agent_id: str) -> List[Source]:
         """
         return self.server.list_attached_sources(agent_id=agent_id)
 
+    def list_files_from_source(self, source_id: str, limit: int = 1000, cursor: Optional[str] = None) -> List[FileMetadata]:
+        """
+        List files from source.
+
+        Args:
+            source_id (str): ID of the source
+            limit (int): The # of items to return
+            cursor (str): The cursor for fetching the next page
+
+        Returns:
+            files (List[FileMetadata]): List of files
+        """
+        return self.server.list_files_from_source(source_id=source_id, limit=limit, cursor=cursor)
+
     def update_source(self, source_id: str, name: Optional[str] = None) -> Source:
         """
         Update a source
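End-to-end, the new client surface added by this commit looks roughly like this (editorial sketch: create_client is an assumed package-level factory, and the base URL and IDs are placeholders; only list_files_from_source and delete_job come from this diff):

    # Editorial sketch of the new client methods.
    from letta import create_client  # assumed client factory

    client = create_client(base_url="http://localhost:8283")  # placeholder URL

    # Page through a source's files; cursor=None starts from the beginning.
    files = client.list_files_from_source(source_id="source-123", limit=100, cursor=None)
    for file_metadata in files:
        print(file_metadata)

    # Jobs (e.g., from load_file_into_source) can now be cleaned up.
    client.delete_job(job_id="job-456")  # placeholder job ID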