Merge pull request #87 from KruxAI/ragbuilder_v2 · KruxAI/ragbuilder@5b08451 · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Commit 5b08451

Browse files
authored
Merge pull request #87 from KruxAI/ragbuilder_v2
Ragbuilder v2
2 parents 108e4a1 + 1430c9b commit 5b08451

26 files changed

+2739
-2595
lines changed

SampleInputFiles/docs-get_started-quickstart.md

Lines changed: 0 additions & 804 deletions
This file was deleted.

SampleInputFiles/docs-langsmith-walkthrough.md

Lines changed: 0 additions & 532 deletions
This file was deleted.

SampleInputFiles/docs-modules-agents-quick_start.md

Lines changed: 0 additions & 484 deletions
This file was deleted.

pyproject.toml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,19 @@ dependencies = [
1515
"datasets>=2.18.0",
1616
"fastapi>=0.100.0",
1717
"jinja2",
18-
"langchain>=0.1.0",
19-
"langchain-community==0.2.7",
20-
"langchain-core==0.2.20",
21-
"langchain-huggingface==0.0.3",
22-
"langchain-openai==0.1.17",
18+
"langchain>=0.3.24",
19+
"langchain-community==0.3.22",
20+
"langchain-core==0.3.55",
21+
"langchain-huggingface==0.1.2",
22+
"langchain-openai==0.3.14",
2323
"opentelemetry-api>=1.23.0",
2424
"opentelemetry-sdk>=1.23.0",
2525
"opentelemetry-exporter-otlp>=1.23.0",
2626
"optuna",
2727
"platformdirs",
2828
"pydantic>=2.0.0",
2929
"python-dotenv",
30-
"ragas==0.1.7",
30+
"ragas==0.2.14",
3131
"rerankers",
3232
"rich>=13.0.0",
3333
"sentence-transformers",

src/ragbuilder/config/base.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -82,20 +82,22 @@ def model_post_init(self, *args, **kwargs):
8282
if self.study_name is None:
8383
# Get the caller module name (data_ingest or retriever)
8484
frame = inspect.currentframe()
85+
caller_module = 'unknown'
86+
8587
while frame:
86-
module_name = inspect.getmodule(frame).__name__
87-
if 'data_ingest' in module_name:
88-
caller_module = 'data_ingest'
89-
break
90-
elif 'retriever' in module_name:
91-
caller_module = 'retriever'
92-
break
93-
elif 'generation' in module_name:
94-
caller_module = 'generation'
95-
break
88+
module = inspect.getmodule(frame)
89+
if module is not None: # Check if module is not None
90+
module_name = module.__name__
91+
if 'data_ingest' in module_name:
92+
caller_module = 'data_ingest'
93+
break
94+
elif 'retriever' in module_name:
95+
caller_module = 'retriever'
96+
break
97+
elif 'generation' in module_name:
98+
caller_module = 'generation'
99+
break
96100
frame = frame.f_back
97-
else:
98-
caller_module = 'unknown'
99101

100102
timestamp = int(time.time()*1000 + random.randint(1, 1000))
101103
self.study_name = f"{caller_module}_{timestamp}"

src/ragbuilder/config/components.py

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class LLMType(str, Enum):
3535
CUSTOM = "custom"
3636

3737
class ParserType(str, Enum):
38+
TEXT = "text"
3839
UNSTRUCTURED = "unstructured"
3940
PYMUPDF = "pymupdf"
4041
PYPDF = "pypdf"
@@ -54,6 +55,13 @@ class ChunkingStrategy(str, Enum):
5455
SEMANTIC = "SemanticChunker"
5556
CUSTOM = "custom"
5657

58+
NO_CHUNK_SIZE_STRATEGIES = [
59+
ChunkingStrategy.MARKDOWN,
60+
ChunkingStrategy.HTML,
61+
ChunkingStrategy.SEMANTIC,
62+
ChunkingStrategy.CUSTOM
63+
]
64+
5765
class EmbeddingType(str, Enum):
5866
OPENAI = "openai"
5967
AZURE_OPENAI = "azure_openai"
@@ -123,7 +131,7 @@ def get_class():
123131
LLM_MAP = {
124132
LLMType.OPENAI: lazy_load("langchain_openai", "ChatOpenAI"),
125133
LLMType.AZURE_OPENAI: lazy_load("langchain_openai", "AzureChatOpenAI"),
126-
LLMType.HUGGINGFACE: lazy_load("langchain_huggingface", "HuggingFaceHub"),
134+
LLMType.HUGGINGFACE: lazy_load("langchain_huggingface", "HuggingFaceEndpoint"),
127135
LLMType.OLLAMA: lazy_load("langchain_ollama", "OllamaChat"),
128136
LLMType.COHERE: lazy_load("langchain_community.llms", "Cohere"),
129137
LLMType.VERTEXAI: lazy_load("langchain_google_vertexai", "VertexAI"),
@@ -132,6 +140,8 @@ def get_class():
132140
}
133141

134142
LOADER_MAP = {
143+
# ParserType.UNSTRUCTURED: lazy_load("langchain_unstructured", "UnstructuredLoader"),
144+
ParserType.TEXT: lazy_load("langchain.document_loaders", "TextLoader"),
135145
ParserType.UNSTRUCTURED: lazy_load("langchain_community.document_loaders", "UnstructuredFileLoader"),
136146
ParserType.PYMUPDF: lazy_load("langchain_community.document_loaders", "PyMuPDFLoader"),
137147
ParserType.PYPDF: lazy_load("langchain_community.document_loaders", "PyPDFLoader"),
@@ -163,18 +173,18 @@ def get_class():
163173
}
164174

165175
VECTORDB_MAP = {
166-
VectorDatabase.FAISS: lazy_load("langchain.vectorstores", "FAISS"),
167-
VectorDatabase.CHROMA: lazy_load("langchain.vectorstores", "Chroma"),
168-
VectorDatabase.PINECONE: lazy_load("langchain.vectorstores", "Pinecone"),
169-
VectorDatabase.WEAVIATE: lazy_load("langchain.vectorstores", "Weaviate"),
170-
VectorDatabase.QDRANT: lazy_load("langchain.vectorstores", "Qdrant"),
171-
VectorDatabase.MILVUS: lazy_load("langchain.vectorstores", "Milvus"),
172-
VectorDatabase.PGVECTOR: lazy_load("langchain.vectorstores", "PGVector"),
173-
VectorDatabase.ELASTICSEARCH: lazy_load("langchain.vectorstores", "ElasticsearchStore"),
176+
VectorDatabase.FAISS: lazy_load("langchain_community.vectorstores", "FAISS"),
177+
VectorDatabase.CHROMA: lazy_load("langchain_chroma", "Chroma"),
178+
VectorDatabase.PINECONE: lazy_load("langchain_pinecone", "PineconeVectorStore"),
179+
VectorDatabase.WEAVIATE: lazy_load("langchain_weaviate.vectorstores", "WeaviateVectorStore"),
180+
VectorDatabase.QDRANT: lazy_load("langchain_qdrant", "QdrantVectorStore"),
181+
VectorDatabase.MILVUS: lazy_load("langchain_milvus", "Milvus"),
182+
VectorDatabase.PGVECTOR: lazy_load("langchain_postgres", "PGVector"),
183+
VectorDatabase.ELASTICSEARCH: lazy_load("langchain-elasticsearch", "ElasticsearchStore"),
174184
}
175185

176186
RETRIEVER_MAP = {
177-
RetrieverType.BM25: lazy_load("langchain.retrievers", "BM25Retriever"),
187+
RetrieverType.BM25: lazy_load("langchain_community.retrievers", "BM25Retriever"),
178188
}
179189

180190
RERANKER_MAP = {
@@ -226,6 +236,12 @@ def get_class():
226236

227237
# Environment variable requirements for components
228238
COMPONENT_ENV_REQUIREMENTS = {
239+
# Unstructured
240+
ParserType.UNSTRUCTURED: {
241+
"required": [],
242+
"optional": [],
243+
"packages": [_PkgSpec("langchain-unstructured")]
244+
},
229245
# Embedding Models
230246
EmbeddingType.AZURE_OPENAI: {
231247
"required": ["AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT"],
@@ -350,40 +366,59 @@ def get_class():
350366
VectorDatabase.PINECONE: {
351367
"required": ["PINECONE_API_KEY", "PINECONE_ENVIRONMENT"],
352368
"optional": [],
353-
"packages": [_PkgSpec("pinecone-client", "pinecone")]
369+
"packages": [
370+
_PkgSpec("langchain-pinecone"),
371+
_PkgSpec("pinecone-client", "pinecone")
372+
]
354373
},
355374
VectorDatabase.WEAVIATE: {
356375
"required": ["WEAVIATE_URL", "WEAVIATE_API_KEY"],
357376
"optional": [],
358-
"packages": [_PkgSpec("weaviate-client", "weaviate")]
377+
"packages": [
378+
_PkgSpec("weaviate-client", "weaviate"),
379+
_PkgSpec("langchain-weaviate")
380+
]
359381
},
360382
VectorDatabase.QDRANT: {
361383
"required": ["QDRANT_URL"],
362384
"optional": ["QDRANT_API_KEY"],
363-
"packages": [_PkgSpec("qdrant-client", "qdrant")]
385+
"packages": [
386+
_PkgSpec("qdrant-client", "qdrant"),
387+
_PkgSpec("langchain-qdrant")
388+
]
364389
},
365390
VectorDatabase.MILVUS: {
366391
"required": ["MILVUS_HOST", "MILVUS_PORT"],
367392
"optional": [],
368-
"packages": [_PkgSpec("pymilvus")]
393+
"packages": [
394+
_PkgSpec("pymilvus"),
395+
_PkgSpec("langchain-milvus")
396+
]
369397
},
370398
VectorDatabase.PGVECTOR: {
371399
"required": ["PGVECTOR_CONNECTION_STRING"],
372400
"optional": [],
373401
"packages": [
402+
_PkgSpec("langchain-postgres"),
374403
_PkgSpec("psycopg2-binary"),
375404
_PkgSpec("pgvector")
376405
]
377406
},
378407
VectorDatabase.ELASTICSEARCH: {
379408
"required": ["ELASTICSEARCH_URL"],
380409
"optional": ["ELASTICSEARCH_API_KEY"],
381-
"packages": [_PkgSpec("elasticsearch")]
410+
"packages": [
411+
_PkgSpec("elasticsearch"),
412+
_PkgSpec("langchain-elasticsearch")
413+
]
382414
},
383415
VectorDatabase.CHROMA: {
384416
"required": [],
385417
"optional": [],
386-
"packages": [_PkgSpec("chromadb")]
418+
"packages": [
419+
_PkgSpec("langchain-chroma"),
420+
_PkgSpec("chromadb")
421+
]
387422
},
388423
VectorDatabase.FAISS: {
389424
"required": [],

src/ragbuilder/config/generation.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ class PromptTemplate(BaseModel):
1717

1818
# Define the Execution Model for Each Question
1919
class QuestionContext(BaseModel):
20-
question: str
21-
ground_truth: str
20+
user_input: str
21+
reference: str
2222

2323

2424

@@ -40,23 +40,23 @@ class ExecutionResult(BaseModel):
4040

4141

4242
class EvalDatasetItem(BaseModel):
43-
question: str
44-
ground_truth: str
43+
user_input: str
44+
reference: str
4545
contexts: Optional[str] = None # Optional field
4646
evolution_type: Optional[str] = None
4747
metadata: Optional[str] = None
4848
episode_done: Optional[bool] = None
4949

50-
@field_validator('question', mode='before')
50+
@field_validator('user_input', mode='before')
5151
def check_question(cls, v):
5252
if not v.strip():
5353
raise ValueError('Question is required and cannot be empty.')
5454
return v
5555

56-
@field_validator('ground_truth', mode='before')
56+
@field_validator('reference', mode='before')
5757
def check_ground_truth(cls, v):
5858
if not v.strip():
59-
raise ValueError('Ground truth is required and cannot be empty.')
59+
raise ValueError('Reference is required and cannot be empty.')
6060
return v
6161

6262
class EvalDataset(BaseModel):

0 commit comments

Comments
 (0)
0