Skip to content

Commit

Permalink
[Feature/Improvements] Delete data sources from metadata db when using `app.delete()` (#1286)
Browse files Browse the repository at this point in the history
  • Loading branch information
deshraj authored Feb 26, 2024
1 parent 92dd7ed commit 752f638
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 7 deletions.
2 changes: 2 additions & 0 deletions embedchain/config/vectordb/pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def __init__(
pod_config: Optional[dict[str, any]] = None,
serverless_config: Optional[dict[str, any]] = None,
hybrid_search: bool = False,
bm25_encoder: any = None,
**extra_params: dict[str, any],
):
self.metric = metric
Expand All @@ -24,6 +25,7 @@ def __init__(
self.vector_dimension = vector_dimension
self.extra_params = extra_params
self.hybrid_search = hybrid_search
self.bm25_encoder = bm25_encoder
if pod_config is None and serverless_config is None:
# If no config is provided, use the default pod spec config
pod_environment = os.environ.get("PINECONE_ENV", "gcp-starter")
Expand Down
19 changes: 15 additions & 4 deletions embedchain/embedchain.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,20 @@
from dotenv import load_dotenv
from langchain.docstore.document import Document

from embedchain.cache import adapt, get_gptcache_session, gptcache_data_convert, gptcache_update_cache_callback
from embedchain.cache import (adapt, get_gptcache_session,
gptcache_data_convert,
gptcache_update_cache_callback)
from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config import AddConfig, BaseLlmConfig, ChunkerConfig
from embedchain.config.base_app_config import BaseAppConfig
from embedchain.core.db.models import DataSource
from embedchain.core.db.models import ChatHistory, DataSource
from embedchain.data_formatter import DataFormatter
from embedchain.embedder.base import BaseEmbedder
from embedchain.helpers.json_serializable import JSONSerializable
from embedchain.llm.base import BaseLlm
from embedchain.loaders.base_loader import BaseLoader
from embedchain.models.data_type import DataType, DirectDataType, IndirectDataType, SpecialDataType
from embedchain.models.data_type import (DataType, DirectDataType,
IndirectDataType, SpecialDataType)
from embedchain.utils.misc import detect_datatype, is_valid_json_string
from embedchain.vectordb.base import BaseVectorDB

Expand Down Expand Up @@ -642,9 +645,10 @@ def reset(self):
"""
try:
self.db_session.query(DataSource).filter_by(app_id=self.config.id).delete()
self.db_session.query(ChatHistory).filter_by(app_id=self.config.id).delete()
self.db_session.commit()
except Exception as e:
logging.error(f"Error deleting chat history: {e}")
logging.error(f"Error deleting data sources: {e}")
self.db_session.rollback()
return None
self.db.reset()
Expand Down Expand Up @@ -682,6 +686,13 @@ def delete(self, source_id: str):
:param source_id: The hash of the source to delete.
:type source_id: str
"""
try:
self.db_session.query(DataSource).filter_by(hash=source_id, app_id=self.config.id).delete()
self.db_session.commit()
except Exception as e:
logging.error(f"Error deleting data sources: {e}")
self.db_session.rollback()
return None
self.db.delete(where={"hash": source_id})
logging.info(f"Successfully deleted {source_id}")
# Send anonymous telemetry
Expand Down
3 changes: 1 addition & 2 deletions embedchain/vectordb/pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,8 @@ def __init__(
# Setup BM25Encoder if sparse vectors are to be used
self.bm25_encoder = None
if self.config.hybrid_search:
# TODO: Add support for fitting BM25Encoder on any corpus
logging.info("Initializing BM25Encoder for sparse vectors..")
self.bm25_encoder = BM25Encoder.default()
self.bm25_encoder = self.config.bm25_encoder if self.config.bm25_encoder else BM25Encoder.default()

# Call parent init here because embedder is needed
super().__init__(config=self.config)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "embedchain"
version = "0.1.86"
version = "0.1.87"
description = "Simplest open source retrieval(RAG) framework"
authors = [
"Taranjeet Singh <[email protected]>",
Expand Down

0 comments on commit 752f638

Please sign in to comment.