Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ graph TD
## Autor

**[@CuriousGu](https://www.github.com/CuriousGu) 🇧🇷**
**[@PrimeRibs2501](https://www.github.com/PrimeRibs2501) 🇧🇷**

## Docs
1. [Estrutura](docs/pt_br/ESTRUTURA.md)
Expand All @@ -70,3 +71,6 @@ Veja o arquivo [LICENSE](LICENSE) para mais detalhes.
- Email: gustavo_ortega@usp.br
- Linkedin: [Gustavo M. Ortega](https://www.linkedin.com/in/gustavomendoncaortega/)

- Email: pedro.hbueno03@usp.br
- Linkedin: [Pedro H. B. Ribeiro](https://www.linkedin.com/in/pedrohenriquebuenoribeiro)

2 changes: 1 addition & 1 deletion docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ services:
attach: false

chroma:
image: chromadb/chroma:latest
image: chromadb/chroma:1.0.12
env_file:
- ../.env
ports:
Expand Down
18 changes: 15 additions & 3 deletions src/api/controllers/files.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from src.services.document_reader import DocumentReader
from src.infrastructure.database import ChromaDB
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from src.infrastructure.config import settings
from src.api.models import FileMetadata

Expand All @@ -17,10 +19,20 @@ async def controller_upload_file(
metadata["extension"] = content["extension"]
metadata["file_name"] = content["name"]

langchain_docs = []

for j, text_content in enumerate(content["content"]):
doc = Document(
page_content=text_content,
metadata={
'extension' : content["extension"],
'file_name' : content["name"]

}
)
langchain_docs.append(doc)
await vector_store.add_documents(
documents=content["content"],
collection_name=settings.INDEX_NAME,
metadatas=[metadata for _ in content["content"]],
documents=langchain_docs
)
return True
except Exception as e:
Expand Down
34 changes: 13 additions & 21 deletions src/infrastructure/database/chromadb/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ def __init__(self):
)
try:
self.client = self._connect()
self.vector_store = Chroma(
client=self.client,
collection_name=self.collection_name,
embedding_function=self.embedding_function,
)
self.retriever = self._as_retriever()
except Exception as e:
raise ConnectionError(f"Failed to connect to ChromaDB: {e}")
Expand Down Expand Up @@ -56,35 +61,22 @@ def _as_retriever(self, collection_name: str = None, k: int = 10) -> dict:
"""
Cria um retriever adaptado à dimensão da coleção
"""
vector_store = Chroma(
client=self.client,
collection_name=collection_name or self.collection_name,
embedding_function=self.embedding_function,
)
retriever = vector_store.as_retriever(k=k)

retriever = self.vector_store.as_retriever(k=k)

return retriever

async def add_documents(
self,
documents: List[str],
collection_name: str,
metadatas: Optional[List[dict]] = None,
):
"""
Adiciona documentos à coleção com dimensão configurada no .env
"""
if not self.collection or self.collection.name != collection_name:
self.collection = self._create_collection(collection_name)

embeddings = self.embedding_function.embed_documents(documents)

return self.collection.add(
documents=documents,
embeddings=embeddings,
metadatas=metadatas,
ids=[str(uuid.uuid4()) for _ in enumerate(documents)],
)
return self.vector_store.add_documents(
documents=documents,
ids=[str(uuid.uuid4()) for _ in enumerate(documents)],
)

async def list_collections(self):
return self.client.list_collections()
Expand All @@ -94,7 +86,7 @@ async def list_documents(
collection_name: str
):
if collection := self.client.get_collection(collection_name):
return collection.get()
return collection.get(include=["metadatas", "documents", "embeddings"])
return []

async def query_documents(
Expand Down Expand Up @@ -164,7 +156,7 @@ def retrieve(query: str, n: int = 10) -> List:
# Limitando a quantidade de documentos retornados para n+10
# Isso garante que tenhamos documentos suficientes para filtrar
# posteriormente
response = retriever.invoke(query)
response = retriever.get_relevant_documents(query)

created_at = [
(index, x.metadata.get("created_at"))
Expand Down