From 1390505665a8359a000b4879f0aed424a14c73e1 Mon Sep 17 00:00:00 2001 From: mtayfur Date: Thu, 9 Oct 2025 23:36:27 +0300 Subject: [PATCH 01/11] Refactor SkipDetector to use a callable embedding function instead of SentenceTransformer; update requirements to remove unnecessary dependencies. --- README.md | 5 +- memory_system.py | 179 +++++++++++++++++++++++------------------------ requirements.txt | 4 +- 3 files changed, 91 insertions(+), 97 deletions(-) diff --git a/README.md b/README.md index 7eaa34c..4423c8d 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ All prompts and logic work language-agnostically. Stores memories in English but Tested with Gemini 2.5 Flash Lite, GPT-4o-mini, Qwen2.5-Instruct, and Mistral-Small. Should work with any model that supports structured outputs. **Embedding Model Support** -Supports any sentence-transformers model. The default `gte-multilingual-base` works well for diverse languages and is efficient enough for real-time use. Make sure to tweak thresholds if you switch to a different model. +Uses OpenWebUI's configured embedding model (supports Ollama, OpenAI, Azure OpenAI, and local sentence-transformers). Configure embedding models through OpenWebUI's RAG settings. The memory system automatically uses whatever embedding backend you've configured in OpenWebUI. ## How It Works @@ -55,7 +55,6 @@ Supports any sentence-transformers model. The default `gte-multilingual-base` wo Customize behavior through valves: - **model**: LLM for consolidation and reranking (default: `gemini-2.5-flash-lite`) -- **embedding_model**: Sentence transformer (default: `gte-multilingual-base`) - **max_memories_returned**: Context injection limit (default: 10) - **semantic_retrieval_threshold**: Minimum similarity score (default: 0.5) - **enable_llm_reranking**: Toggle smart reranking (default: true) @@ -65,7 +64,7 @@ Customize behavior through valves: - Batched embedding generation for efficiency - Normalized embeddings for faster similarity computation -- Cached embeddings prevent redundant model calls +- Cached embeddings prevent redundant API calls to OpenWebUI's embedding backend - LRU eviction keeps memory footprint bounded - Fast-path skip detection for instant filtering - Selective LLM usage based on candidate count diff --git a/memory_system.py b/memory_system.py index 6c582a4..284a5ec 100644 --- a/memory_system.py +++ b/memory_system.py @@ -15,19 +15,15 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np from pydantic import BaseModel, ConfigDict, Field, ValidationError as PydanticValidationError -from sentence_transformers import SentenceTransformer from open_webui.utils.chat import generate_chat_completion from open_webui.models.users import Users from open_webui.routers.memories import Memories from fastapi import Request -logging.getLogger("transformers").setLevel(logging.ERROR) -logging.getLogger("sentence_transformers").setLevel(logging.ERROR) - logger = logging.getLogger("MemorySystem") -_SHARED_MODEL_CACHE = {} +_SHARED_SKIP_DETECTOR_CACHE = {} class Constants: """Centralized configuration constants for the memory system.""" @@ -65,7 +61,6 @@ class Constants: # Default Models DEFAULT_LLM_MODEL = "google/gemini-2.5-flash-lite" - DEFAULT_EMBEDDING_MODEL = "Alibaba-NLP/gte-multilingual-base" class Prompts: """Container for all LLM prompts used in the memory system.""" @@ -462,58 +457,46 @@ class SkipDetector: SkipReason.SKIP_GRAMMAR_PROOFREAD: "๐Ÿ“ Grammar/Proofreading Request Detected, skipping memory operations", } 
- def __init__(self, embedding_model: SentenceTransformer): - """Initialize the skip detector with an embedding model and compute reference embeddings.""" - self.embedding_model = embedding_model + def __init__(self, embedding_function: Callable[[Union[str, List[str]]], Union[np.ndarray, List[np.ndarray]]]): + """Initialize the skip detector with an embedding function and compute reference embeddings.""" + self.embedding_function = embedding_function self._reference_embeddings = None self._initialize_reference_embeddings() def _initialize_reference_embeddings(self) -> None: """Compute and cache embeddings for category descriptions.""" try: - technical_embeddings = self.embedding_model.encode( - self.TECHNICAL_CATEGORY_DESCRIPTIONS, - convert_to_tensor=True, - show_progress_bar=False + technical_embeddings = self.embedding_function( + self.TECHNICAL_CATEGORY_DESCRIPTIONS ) - instruction_embeddings = self.embedding_model.encode( - self.INSTRUCTION_CATEGORY_DESCRIPTIONS, - convert_to_tensor=True, - show_progress_bar=False + instruction_embeddings = self.embedding_function( + self.INSTRUCTION_CATEGORY_DESCRIPTIONS ) - pure_math_embeddings = self.embedding_model.encode( - self.PURE_MATH_CALCULATION_CATEGORY_DESCRIPTIONS, - convert_to_tensor=True, - show_progress_bar=False + pure_math_embeddings = self.embedding_function( + self.PURE_MATH_CALCULATION_CATEGORY_DESCRIPTIONS ) - translation_embeddings = self.embedding_model.encode( - self.EXPLICIT_TRANSLATION_CATEGORY_DESCRIPTIONS, - convert_to_tensor=True, - show_progress_bar=False + translation_embeddings = self.embedding_function( + self.EXPLICIT_TRANSLATION_CATEGORY_DESCRIPTIONS ) - grammar_embeddings = self.embedding_model.encode( - self.GRAMMAR_PROOFREADING_CATEGORY_DESCRIPTIONS, - convert_to_tensor=True, - show_progress_bar=False + grammar_embeddings = self.embedding_function( + self.GRAMMAR_PROOFREADING_CATEGORY_DESCRIPTIONS ) - conversational_embeddings = self.embedding_model.encode( - self.CONVERSATIONAL_CATEGORY_DESCRIPTIONS, - convert_to_tensor=True, - show_progress_bar=False + conversational_embeddings = self.embedding_function( + self.CONVERSATIONAL_CATEGORY_DESCRIPTIONS ) self._reference_embeddings = { - 'technical': technical_embeddings, - 'instruction': instruction_embeddings, - 'pure_math': pure_math_embeddings, - 'translation': translation_embeddings, - 'grammar': grammar_embeddings, - 'conversational': conversational_embeddings, + 'technical': np.array(technical_embeddings), + 'instruction': np.array(instruction_embeddings), + 'pure_math': np.array(pure_math_embeddings), + 'translation': np.array(translation_embeddings), + 'grammar': np.array(grammar_embeddings), + 'conversational': np.array(conversational_embeddings), } total_skip_categories = ( @@ -569,7 +552,6 @@ class SkipDetector: parts = line[2:].split() if parts and parts[0].isalnum(): actual_command_lines += 1 - # Check for lines with embedded $ commands (e.g., "Run: $ command") elif '$ ' in line: dollar_index = line.find('$ ') if dollar_index > 0 and line[dollar_index-1] in (' ', ':', '\t'): @@ -583,7 +565,6 @@ class SkipDetector: elif line.startswith('> ') and len(line) > 2: pass - # Lowered threshold: even 1 command line with URL/pipe is technical if actual_command_lines >= 1 and any(c in message for c in ['http://', 'https://', ' | ']): return self.SkipReason.SKIP_TECHNICAL.value if actual_command_lines >= 3: @@ -602,23 +583,19 @@ class SkipDetector: if markup_chars >= 6: if markup_chars / msg_len > 0.10: return self.SkipReason.SKIP_TECHNICAL.value - # Special check 
for JSON-like structures (many nested braces) - # Even with low density, if we have lots of curly braces, it's likely JSON curly_count = message.count('{') + message.count('}') if curly_count >= 10: return self.SkipReason.SKIP_TECHNICAL.value # Pattern 7: Structured nested content with colons (key: value patterns) line_count = message.count('\n') - if line_count >= 8: # At least 8 lines + if line_count >= 8: lines = message.split('\n') non_empty_lines = [line for line in lines if line.strip()] if non_empty_lines: - # Count lines with colon patterns (key: value or similar) colon_lines = sum(1 for line in non_empty_lines if ':' in line and not line.strip().startswith('#')) indented_lines = sum(1 for line in non_empty_lines if line.startswith((' ', '\t'))) - # If most lines have colons and indentation, it's structured data if (colon_lines / len(non_empty_lines) > 0.4 and indented_lines / len(non_empty_lines) > 0.5): return self.SkipReason.SKIP_TECHNICAL.value @@ -631,7 +608,6 @@ class SkipDetector: markup_in_lines = sum(1 for line in non_empty_lines if any(c in line for c in '{}[]<>')) structured_lines = sum(1 for line in non_empty_lines if line.startswith((' ', '\t'))) - # Require high markup presence or indented structure with technical keywords if markup_in_lines / len(non_empty_lines) > 0.3: return self.SkipReason.SKIP_TECHNICAL.value elif structured_lines / len(non_empty_lines) > 0.6: @@ -684,18 +660,12 @@ class SkipDetector: return None try: - from sentence_transformers import util + message_embedding = np.array(self.embedding_function([message.strip()])[0]) - message_embedding = self.embedding_model.encode( - message.strip(), - convert_to_tensor=True, - show_progress_bar=False - ) - - conversational_similarities = util.cos_sim( + conversational_similarities = np.dot( message_embedding, - self._reference_embeddings['conversational'] - )[0] + self._reference_embeddings['conversational'].T + ) max_conversational_similarity = float(conversational_similarities.max()) skip_categories = [ @@ -707,10 +677,10 @@ class SkipDetector: ] for cat_key, skip_reason, descriptions in skip_categories: - similarities = util.cos_sim( + similarities = np.dot( message_embedding, - self._reference_embeddings[cat_key] - )[0] + self._reference_embeddings[cat_key].T + ) max_similarity = float(similarities.max()) if max_similarity > Constants.SKIP_DETECTION_SIMILARITY_THRESHOLD: @@ -1069,7 +1039,6 @@ class Filter: """Configuration valves for the Memory System.""" model: str = Field(default=Constants.DEFAULT_LLM_MODEL, description="Model name for LLM operations") - embedding_model: str = Field(default=Constants.DEFAULT_EMBEDDING_MODEL, description="Sentence transformer model for embeddings") max_memories_returned: int = Field(default=Constants.MAX_MEMORIES_PER_RETRIEVAL, description="Maximum number of memories to return in context") max_message_chars: int = Field(default=Constants.MAX_MESSAGE_CHARS, description="Maximum user message length before skipping memory operations") semantic_retrieval_threshold: float = Field(default=Constants.SEMANTIC_RETRIEVAL_THRESHOLD, description="Minimum similarity threshold for memory retrieval") @@ -1079,7 +1048,7 @@ class Filter: def __init__(self): """Initialize the Memory System filter with production validation.""" - global _SHARED_MODEL_CACHE + global _SHARED_SKIP_DETECTOR_CACHE self.valves = self.Valves() self._validate_system_configuration() @@ -1088,21 +1057,8 @@ class Filter: self._background_tasks: set = set() self._shutdown_event = asyncio.Event() - model_key = 
self.valves.embedding_model - - if model_key in _SHARED_MODEL_CACHE: - logger.info(f"โ™ป๏ธ Reusing cached embedding model: {model_key}") - self._model = _SHARED_MODEL_CACHE[model_key]["model"] - self._skip_detector = _SHARED_MODEL_CACHE[model_key]["skip_detector"] - else: - logger.info(f"๐Ÿค– Loading embedding model: {model_key} (cache has {len(_SHARED_MODEL_CACHE)} models)") - self._model = SentenceTransformer(self.valves.embedding_model, device="auto", trust_remote_code=True) - self._skip_detector = SkipDetector(self._model) - _SHARED_MODEL_CACHE[model_key] = { - "model": self._model, - "skip_detector": self._skip_detector - } - logger.info(f"โœ… Embedding model and skip detector initialized and cached") + self._embedding_function = None + self._skip_detector = None self._llm_reranking_service = LLMRerankingService(self) self._llm_consolidation_service = LLMConsolidationService(self) @@ -1118,6 +1074,35 @@ class Filter: self.__model__ = __model__ if __request__: self.__request__ = __request__ + + if self._embedding_function is None and hasattr(__request__.app.state, 'EMBEDDING_FUNCTION'): + self._embedding_function = __request__.app.state.EMBEDDING_FUNCTION + logger.info(f"โœ… Using OpenWebUI's embedding function") + + if self._skip_detector is None: + global _SHARED_SKIP_DETECTOR_CACHE + embedding_engine = getattr(__request__.app.state.config, 'RAG_EMBEDDING_ENGINE', '') + embedding_model = getattr(__request__.app.state.config, 'RAG_EMBEDDING_MODEL', '') + cache_key = f"{embedding_engine}:{embedding_model}" + + if cache_key in _SHARED_SKIP_DETECTOR_CACHE: + logger.info(f"โ™ป๏ธ Reusing cached skip detector: {cache_key}") + self._skip_detector = _SHARED_SKIP_DETECTOR_CACHE[cache_key] + else: + logger.info(f"๐Ÿค– Initializing skip detector with OpenWebUI embeddings: {cache_key}") + embedding_fn = self._embedding_function + def embedding_wrapper(texts: Union[str, List[str]]) -> Union[np.ndarray, List[np.ndarray]]: + result = embedding_fn(texts, prefix=None, user=None) + if isinstance(result, list): + if isinstance(result[0], list): + return [np.array(emb, dtype=np.float16) for emb in result] + return np.array(result, dtype=np.float16) + return np.array(result, dtype=np.float16) + + self._skip_detector = SkipDetector(embedding_wrapper) + _SHARED_SKIP_DETECTOR_CACHE[cache_key] = self._skip_detector + logger.info(f"โœ… Skip detector initialized and cached") + def _truncate_content(self, content: str, max_length: Optional[int] = None) -> str: """Truncate content with ellipsis if needed.""" @@ -1169,24 +1154,20 @@ class Filter: """Compute SHA256 hash for text caching.""" return hashlib.sha256(text.encode()).hexdigest() - def _normalize_embedding(self, embedding: np.ndarray) -> np.ndarray: + def _normalize_embedding(self, embedding: Union[List[float], np.ndarray]) -> np.ndarray: """Normalize embedding vector.""" - embedding = embedding.astype(np.float16) + if isinstance(embedding, list): + embedding = np.array(embedding, dtype=np.float16) + else: + embedding = embedding.astype(np.float16) norm = np.linalg.norm(embedding) return embedding / norm if norm > 0 else embedding - def _generate_embeddings_sync(self, model, texts: Union[str, List[str]]) -> Union[np.ndarray, List[np.ndarray]]: - """Synchronous embedding generation for single text or batch.""" - is_single = isinstance(texts, str) - input_texts = [texts] if is_single else texts - - embeddings = model.encode(input_texts, convert_to_numpy=True, show_progress_bar=False) - normalized = [self._normalize_embedding(emb) for emb in embeddings] - - 
return normalized[0] if is_single else normalized - async def _generate_embeddings(self, texts: Union[str, List[str]], user_id: str) -> Union[np.ndarray, List[np.ndarray]]: - """Unified embedding generation for single text or batch with optimized caching.""" + """Unified embedding generation for single text or batch with optimized caching using OpenWebUI's embedding function.""" + if self._embedding_function is None: + raise RuntimeError("๐Ÿค– Embedding function not initialized. Ensure pipeline context is set.") + is_single = isinstance(texts, str) text_list = [texts] if is_single else texts @@ -1219,8 +1200,24 @@ class Filter: uncached_hashes.append(text_hash) if uncached_texts: + user = await asyncio.to_thread(Users.get_user_by_id, user_id) if hasattr(self, '__user__') else None + loop = asyncio.get_event_loop() - new_embeddings = await loop.run_in_executor(None, self._generate_embeddings_sync, self._model, uncached_texts) + raw_embeddings = await loop.run_in_executor( + None, + self._embedding_function, + uncached_texts, + None, + user + ) + + if isinstance(raw_embeddings, list) and len(raw_embeddings) > 0: + if isinstance(raw_embeddings[0], list): + new_embeddings = [self._normalize_embedding(emb) for emb in raw_embeddings] + else: + new_embeddings = [self._normalize_embedding(raw_embeddings)] + else: + new_embeddings = [self._normalize_embedding(raw_embeddings)] for j, embedding in enumerate(new_embeddings): original_idx = uncached_indices[j] diff --git a/requirements.txt b/requirements.txt index 7abc7cc..e8f6881 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ aiohttp>=3.12.15 pydantic>=2.11.7 -sentence-transformers>=5.1.1 -torch>=2.8.0 -transformers>=4.57.0 +numpy>=2.0.0 open-webui>=0.6.32 tiktoken>=0.11.0 From 2db2d3f2c873314949078dcf16c0a439482906b9 Mon Sep 17 00:00:00 2001 From: mtayfur Date: Sun, 12 Oct 2025 21:44:51 +0300 Subject: [PATCH 02/11] Refactor SkipDetector to streamline skip detection logic and improve clarity; update method signature for better integration with memory system. --- memory_system.py | 67 ++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/memory_system.py b/memory_system.py index 284a5ec..ec88f2b 100644 --- a/memory_system.py +++ b/memory_system.py @@ -47,11 +47,9 @@ class Constants: EXTENDED_MAX_MEMORY_MULTIPLIER = 1.5 # Multiplier for expanding memory candidates in advanced operations LLM_RERANKING_TRIGGER_MULTIPLIER = 0.5 # Multiplier for LLM reranking trigger threshold - # Skip Detection Thresholds - SKIP_DETECTION_SIMILARITY_THRESHOLD = 0.50 # Similarity threshold for skip category detection (tuned for zero-shot) - SKIP_DETECTION_MARGIN = 0.05 # Minimum margin required between skip and conversational similarity to skip - SKIP_DETECTION_CONFIDENT_MARGIN = 0.15 # Margin threshold for confident skips that trigger early exit - + # Skip Detection + SKIP_CATEGORY_MARGIN = 0.1 # Margin above conversational similarity for skip category classification + # Safety & Operations MAX_DELETE_OPERATIONS_RATIO = 0.6 # Maximum delete operations ratio for safety MIN_OPS_FOR_DELETE_RATIO_CHECK = 6 # Minimum operations to apply ratio check @@ -637,12 +635,11 @@ class SkipDetector: return None - def detect_skip_reason(self, message: str, max_message_chars: int = Constants.MAX_MESSAGE_CHARS) -> Optional[str]: + def detect_skip_reason(self, message: str, max_message_chars: int, memory_system: 'Filter') -> Optional[str]: """ Detect if a message should be skipped using two-stage detection: 1. 
Fast-path structural patterns (~95% confidence) 2. Semantic classification (for remaining cases) - Returns: Skip reason string if content should be skipped, None otherwise """ @@ -676,6 +673,9 @@ class SkipDetector: ('pure_math', self.SkipReason.SKIP_PURE_MATH, self.PURE_MATH_CALCULATION_CATEGORY_DESCRIPTIONS), ] + qualifying_categories = [] + margin_threshold = max_conversational_similarity + Constants.SKIP_CATEGORY_MARGIN + for cat_key, skip_reason, descriptions in skip_categories: similarities = np.dot( message_embedding, @@ -683,16 +683,13 @@ class SkipDetector: ) max_similarity = float(similarities.max()) - if max_similarity > Constants.SKIP_DETECTION_SIMILARITY_THRESHOLD: - margin = max_similarity - max_conversational_similarity - - if margin > Constants.SKIP_DETECTION_CONFIDENT_MARGIN: - logger.info(f"Skipping message - {skip_reason.value} ({cat_key}: {max_similarity:.3f}, conv: {max_conversational_similarity:.3f}, margin: {margin:.3f})") - return skip_reason.value - - if margin > Constants.SKIP_DETECTION_MARGIN: - logger.info(f"Skipping message - {skip_reason.value} ({cat_key}: {max_similarity:.3f}, conv: {max_conversational_similarity:.3f}, margin: {margin:.3f})") - return skip_reason.value + if max_similarity > margin_threshold: + qualifying_categories.append((max_similarity, cat_key, skip_reason)) + + if qualifying_categories: + highest_similarity, highest_cat_key, highest_skip_reason = max(qualifying_categories, key=lambda x: x[0]) + logger.info(f"๐Ÿšซ Skipping message: {highest_skip_reason.value} (sim {highest_similarity:.3f} > conv {max_conversational_similarity:.3f} + {Constants.SKIP_CATEGORY_MARGIN:.3f})") + return highest_skip_reason.value return None @@ -789,18 +786,25 @@ class LLMConsolidationService: def __init__(self, memory_system): self.memory_system = memory_system + def _filter_consolidation_candidates(self, similarities: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], str]: + """Filter consolidation candidates by threshold and return candidates with threshold info.""" + consolidation_threshold = self.memory_system._get_retrieval_threshold(is_consolidation=True) + candidates = [mem for mem in similarities if mem["relevance"] >= consolidation_threshold] + + max_consolidation_memories = int(self.memory_system.valves.max_memories_returned * Constants.EXTENDED_MAX_MEMORY_MULTIPLIER) + candidates = candidates[:max_consolidation_memories] + + threshold_info = f"{consolidation_threshold:.3f} (max: {max_consolidation_memories})" + return candidates, threshold_info + async def collect_consolidation_candidates( self, user_message: str, user_id: str, cached_similarities: Optional[List[Dict[str, Any]]] = None ) -> List[Dict[str, Any]]: """Collect candidate memories for consolidation analysis using cached or computed similarities.""" if cached_similarities: - consolidation_threshold = self.memory_system._get_retrieval_threshold(is_consolidation=True) - candidates = [mem for mem in cached_similarities if mem["relevance"] >= consolidation_threshold] - - max_consolidation_memories = int(self.memory_system.valves.max_memories_returned * Constants.EXTENDED_MAX_MEMORY_MULTIPLIER) - candidates = candidates[:max_consolidation_memories] + candidates, threshold_info = self._filter_consolidation_candidates(cached_similarities) - logger.info(f"๐ŸŽฏ Found {len(candidates)} candidate memories for consolidation (threshold: {consolidation_threshold:.3f}, max: {max_consolidation_memories})") + logger.info(f"๐ŸŽฏ Found {len(candidates)} candidate memories for consolidation (threshold: 
{threshold_info})") self.memory_system._log_retrieved_memories(candidates, "consolidation") return candidates @@ -826,13 +830,7 @@ class LLMConsolidationService: return [] if all_similarities: - consolidation_threshold = self.memory_system._get_retrieval_threshold(is_consolidation=True) - candidates = [mem for mem in all_similarities if mem["relevance"] >= consolidation_threshold] - - max_consolidation_memories = int(self.memory_system.valves.max_memories_returned * Constants.EXTENDED_MAX_MEMORY_MULTIPLIER) - candidates = candidates[:max_consolidation_memories] - - threshold_info = f"{consolidation_threshold:.3f} (max: {max_consolidation_memories})" + candidates, threshold_info = self._filter_consolidation_candidates(all_similarities) else: candidates = [] threshold_info = 'N/A' @@ -1039,10 +1037,13 @@ class Filter: """Configuration valves for the Memory System.""" model: str = Field(default=Constants.DEFAULT_LLM_MODEL, description="Model name for LLM operations") - max_memories_returned: int = Field(default=Constants.MAX_MEMORIES_PER_RETRIEVAL, description="Maximum number of memories to return in context") + max_message_chars: int = Field(default=Constants.MAX_MESSAGE_CHARS, description="Maximum user message length before skipping memory operations") + max_memories_returned: int = Field(default=Constants.MAX_MEMORIES_PER_RETRIEVAL, description="Maximum number of memories to return in context") + semantic_retrieval_threshold: float = Field(default=Constants.SEMANTIC_RETRIEVAL_THRESHOLD, description="Minimum similarity threshold for memory retrieval") - relaxed_semantic_threshold_multiplier: float = Field(default=Constants.RELAXED_SEMANTIC_THRESHOLD_MULTIPLIER, description="Adjusts similarity threshold for memory consolidation (lower = more candidates)") + relaxed_semantic_threshold_multiplier: float = Field(default=Constants.RELAXED_SEMANTIC_THRESHOLD_MULTIPLIER, description="Adjusts similarity threshold for memory consolidation (lower = more candidates)") + enable_llm_reranking: bool = Field(default=True, description="Enable LLM-based memory reranking for improved contextual selection") llm_reranking_trigger_multiplier: float = Field(default=Constants.LLM_RERANKING_TRIGGER_MULTIPLIER, description="Controls when LLM reranking activates (lower = more aggressive)") @@ -1239,7 +1240,7 @@ class Filter: if self._skip_detector is None: raise RuntimeError("๐Ÿค– Skip detector not initialized") - skip_reason = self._skip_detector.detect_skip_reason(user_message, self.valves.max_message_chars) + skip_reason = self._skip_detector.detect_skip_reason(user_message, self.valves.max_message_chars, memory_system=self) if skip_reason: status_key = SkipDetector.SkipReason(skip_reason) return True, SkipDetector.STATUS_MESSAGES[status_key] From 158f0d1983c31a78db34fd31746580a7db9c2d34 Mon Sep 17 00:00:00 2001 From: mtayfur Date: Sun, 12 Oct 2025 22:54:18 +0300 Subject: [PATCH 03/11] Refactor memory operations in Filter class for improved readability and consistency; utilize statistics.median for score calculation and streamline operation details formatting. 
--- memory_system.py | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/memory_system.py b/memory_system.py index ec88f2b..5e43cea 100644 --- a/memory_system.py +++ b/memory_system.py @@ -7,6 +7,7 @@ import asyncio import hashlib import json import logging +import statistics import time from collections import OrderedDict from datetime import datetime, timezone @@ -820,8 +821,8 @@ class LLMConsolidationService: if not user_memories: logger.info("๐Ÿ’ญ No existing memories found for consolidation") return [] - else: - logger.info(f"๐Ÿš€ Reusing cached user memories for consolidation: {len(user_memories)} memories") + + logger.info(f"๐Ÿš€ Reusing cached user memories for consolidation: {len(user_memories)} memories") try: all_similarities, _, _ = await self.memory_system._compute_similarities(user_message, user_id, user_memories) @@ -1293,7 +1294,7 @@ class Filter: top_score = max(scores) lowest_score = min(scores) - median_score = sorted(scores)[len(scores) // 2] + median_score = statistics.median(scores) context_label = "๐Ÿ“Š Consolidation candidate memories" if context_type == "consolidation" else "๐Ÿ“Š Retrieved memories" max_scores_to_show = int(self.valves.max_memories_returned * Constants.EXTENDED_MAX_MEMORY_MULTIPLIER) @@ -1304,16 +1305,8 @@ class Filter: logger.info(f"Scores: [{scores_str}{suffix}]") def _build_operation_details(self, created_count: int, updated_count: int, deleted_count: int) -> List[str]: - """Build operation details list with consistent formatting.""" - operation_details = [] - operations = [(created_count, "๐Ÿ“ Created"), (updated_count, "โœ๏ธ Updated"), (deleted_count, "๐Ÿ—‘๏ธ Deleted")] - - for count, label in operations: - if count > 0: - operation_details.append(f"{label} {count}") - - return operation_details + return [f"{label} {count}" for count, label in operations if count > 0] def _cache_key(self, cache_type: str, user_id: str, content: Optional[str] = None) -> str: """Unified cache key generation for all cache types.""" @@ -1522,8 +1515,7 @@ class Filter: if user_memories is None: user_memories = await self._get_user_memories(user_id) - if user_memories: - await self._cache_manager.put(user_id, self._cache_manager.MEMORY_CACHE, memory_cache_key, user_memories) + await self._cache_manager.put(user_id, self._cache_manager.MEMORY_CACHE, memory_cache_key, user_memories) retrieval_result = await self._retrieve_relevant_memories(user_message, user_id, user_memories, __event_emitter__) memories = retrieval_result.get("memories", []) @@ -1632,36 +1624,41 @@ class Filter: """Execute a single memory operation.""" try: if operation.operation == Models.MemoryOperationType.CREATE: - if not operation.content.strip(): + content_stripped = operation.content.strip() + if not content_stripped: logger.warning(f"โš ๏ธ Skipping CREATE operation: empty content") return Models.OperationResult.SKIPPED_EMPTY_CONTENT.value await asyncio.wait_for( - asyncio.to_thread(Memories.insert_new_memory, user.id, operation.content.strip()), timeout=Constants.DATABASE_OPERATION_TIMEOUT_SEC + asyncio.to_thread(Memories.insert_new_memory, user.id, content_stripped), timeout=Constants.DATABASE_OPERATION_TIMEOUT_SEC ) return Models.MemoryOperationType.CREATE.value elif operation.operation == Models.MemoryOperationType.UPDATE: - if not operation.id.strip(): + id_stripped = operation.id.strip() + if not id_stripped: logger.warning(f"โš ๏ธ Skipping UPDATE operation: empty ID") return Models.OperationResult.SKIPPED_EMPTY_ID.value - 
if not operation.content.strip(): - logger.warning(f"โš ๏ธ Skipping UPDATE operation for {operation.id}: empty content") + + content_stripped = operation.content.strip() + if not content_stripped: + logger.warning(f"โš ๏ธ Skipping UPDATE operation for {id_stripped}: empty content") return Models.OperationResult.SKIPPED_EMPTY_CONTENT.value await asyncio.wait_for( - asyncio.to_thread(Memories.update_memory_by_id_and_user_id, operation.id, user.id, operation.content.strip()), + asyncio.to_thread(Memories.update_memory_by_id_and_user_id, id_stripped, user.id, content_stripped), timeout=Constants.DATABASE_OPERATION_TIMEOUT_SEC, ) return Models.MemoryOperationType.UPDATE.value elif operation.operation == Models.MemoryOperationType.DELETE: - if not operation.id.strip(): + id_stripped = operation.id.strip() + if not id_stripped: logger.warning(f"โš ๏ธ Skipping DELETE operation: empty ID") return Models.OperationResult.SKIPPED_EMPTY_ID.value await asyncio.wait_for( - asyncio.to_thread(Memories.delete_memory_by_id_and_user_id, operation.id, user.id), timeout=Constants.DATABASE_OPERATION_TIMEOUT_SEC + asyncio.to_thread(Memories.delete_memory_by_id_and_user_id, id_stripped, user.id), timeout=Constants.DATABASE_OPERATION_TIMEOUT_SEC ) return Models.MemoryOperationType.DELETE.value else: From 849dd71a01837716593bc4f92746c00d3d08880b Mon Sep 17 00:00:00 2001 From: mtayfur Date: Sun, 12 Oct 2025 23:03:36 +0300 Subject: [PATCH 04/11] Refactor memory selection logic in LLMRerankingService for improved clarity; streamline response handling by directly using response.ids. --- memory_system.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/memory_system.py b/memory_system.py index 5e43cea..e04424d 100644 --- a/memory_system.py +++ b/memory_system.py @@ -730,11 +730,9 @@ CANDIDATE MEMORIES: try: response = await self.memory_system._query_llm(Prompts.MEMORY_RERANKING, user_prompt, response_model=Models.MemoryRerankingResponse) - selected_ids = response.ids - selected_memories = [] for memory in candidate_memories: - if memory["id"] in selected_ids and len(selected_memories) < max_count: + if memory["id"] in response.ids and len(selected_memories) < max_count: selected_memories.append(memory) logger.info(f"๐Ÿง  LLM selected {len(selected_memories)} out of {len(candidate_memories)} candidates") @@ -777,7 +775,6 @@ CANDIDATE MEMORIES: duration_text = f" in {duration:.2f}s" if duration >= 0.01 else "" retrieval_method = "LLM" if should_use_llm else "Semantic" await self.memory_system._emit_status(emitter, f"๐ŸŽฏ {retrieval_method} Memory Retrieval Complete{duration_text}", done=True) - logger.info(f"๐ŸŽฏ {retrieval_method} Memory Retrieval Complete{duration_text}") return selected_memories, analysis_info From 2deba4fb2cc3d3de41807110eac9bf0b741c1645 Mon Sep 17 00:00:00 2001 From: mtayfur Date: Sun, 12 Oct 2025 23:24:58 +0300 Subject: [PATCH 05/11] Refactor Filter class to use async for pipeline context setup; implement locking mechanism for shared skip detector cache to enhance concurrency safety. 
--- memory_system.py | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/memory_system.py b/memory_system.py index e04424d..0b32807 100644 --- a/memory_system.py +++ b/memory_system.py @@ -25,6 +25,7 @@ from fastapi import Request logger = logging.getLogger("MemorySystem") _SHARED_SKIP_DETECTOR_CACHE = {} +_SHARED_SKIP_DETECTOR_CACHE_LOCK = asyncio.Lock() class Constants: """Centralized configuration constants for the memory system.""" @@ -1062,7 +1063,7 @@ class Filter: self._llm_reranking_service = LLMRerankingService(self) self._llm_consolidation_service = LLMConsolidationService(self) - def _set_pipeline_context(self, __event_emitter__: Optional[Callable] = None, __user__: Optional[Dict[str, Any]] = None, + async def _set_pipeline_context(self, __event_emitter__: Optional[Callable] = None, __user__: Optional[Dict[str, Any]] = None, __model__: Optional[str] = None, __request__: Optional[Request] = None) -> None: """Set pipeline context parameters to avoid duplication in inlet/outlet methods.""" if __event_emitter__: @@ -1079,28 +1080,29 @@ class Filter: logger.info(f"โœ… Using OpenWebUI's embedding function") if self._skip_detector is None: - global _SHARED_SKIP_DETECTOR_CACHE + global _SHARED_SKIP_DETECTOR_CACHE, _SHARED_SKIP_DETECTOR_CACHE_LOCK embedding_engine = getattr(__request__.app.state.config, 'RAG_EMBEDDING_ENGINE', '') embedding_model = getattr(__request__.app.state.config, 'RAG_EMBEDDING_MODEL', '') cache_key = f"{embedding_engine}:{embedding_model}" - if cache_key in _SHARED_SKIP_DETECTOR_CACHE: - logger.info(f"โ™ป๏ธ Reusing cached skip detector: {cache_key}") - self._skip_detector = _SHARED_SKIP_DETECTOR_CACHE[cache_key] - else: - logger.info(f"๐Ÿค– Initializing skip detector with OpenWebUI embeddings: {cache_key}") - embedding_fn = self._embedding_function - def embedding_wrapper(texts: Union[str, List[str]]) -> Union[np.ndarray, List[np.ndarray]]: - result = embedding_fn(texts, prefix=None, user=None) - if isinstance(result, list): - if isinstance(result[0], list): - return [np.array(emb, dtype=np.float16) for emb in result] + async with _SHARED_SKIP_DETECTOR_CACHE_LOCK: + if cache_key in _SHARED_SKIP_DETECTOR_CACHE: + logger.info(f"โ™ป๏ธ Reusing cached skip detector: {cache_key}") + self._skip_detector = _SHARED_SKIP_DETECTOR_CACHE[cache_key] + else: + logger.info(f"๐Ÿค– Initializing skip detector with OpenWebUI embeddings: {cache_key}") + embedding_fn = self._embedding_function + def embedding_wrapper(texts: Union[str, List[str]]) -> Union[np.ndarray, List[np.ndarray]]: + result = embedding_fn(texts, prefix=None, user=None) + if isinstance(result, list): + if isinstance(result[0], list): + return [np.array(emb, dtype=np.float16) for emb in result] + return np.array(result, dtype=np.float16) return np.array(result, dtype=np.float16) - return np.array(result, dtype=np.float16) - - self._skip_detector = SkipDetector(embedding_wrapper) - _SHARED_SKIP_DETECTOR_CACHE[cache_key] = self._skip_detector - logger.info(f"โœ… Skip detector initialized and cached") + + self._skip_detector = SkipDetector(embedding_wrapper) + _SHARED_SKIP_DETECTOR_CACHE[cache_key] = self._skip_detector + logger.info(f"โœ… Skip detector initialized and cached") def _truncate_content(self, content: str, max_length: Optional[int] = None) -> str: @@ -1492,7 +1494,7 @@ class Filter: **kwargs, ) -> Dict[str, Any]: """Simplified inlet processing for memory retrieval and injection.""" - self._set_pipeline_context(__event_emitter__, __user__, 
__model__, __request__) + await self._set_pipeline_context(__event_emitter__, __user__, __model__, __request__) user_id = __user__.get("id") if body and __user__ else None if not user_id: @@ -1540,7 +1542,7 @@ class Filter: **kwargs, ) -> dict: """Simplified outlet processing for background memory consolidation.""" - self._set_pipeline_context(__event_emitter__, __user__, __model__, __request__) + await self._set_pipeline_context(__event_emitter__, __user__, __model__, __request__) user_id = __user__.get("id") if body and __user__ else None if not user_id: From e3709fe677b31436d07cdc80d7ff7bd00f099fa8 Mon Sep 17 00:00:00 2001 From: mtayfur Date: Wed, 15 Oct 2025 14:05:01 +0300 Subject: [PATCH 06/11] Refactor cache management in Filter class; reduce maximum cache entries and concurrent user caches for improved performance and clarity. Update cache management methods for consistency and better logging. --- memory_system.py | 43 ++++++++++--------------------------------- 1 file changed, 10 insertions(+), 33 deletions(-) diff --git a/memory_system.py b/memory_system.py index 0b32807..166de0d 100644 --- a/memory_system.py +++ b/memory_system.py @@ -39,8 +39,8 @@ class Constants: LLM_CONSOLIDATION_TIMEOUT_SEC = 60.0 # Timeout for LLM consolidation operations # Cache System - MAX_CACHE_ENTRIES_PER_TYPE = 2500 # Maximum cache entries per cache type - MAX_CONCURRENT_USER_CACHES = 250 # Maximum concurrent user cache instances + MAX_CACHE_ENTRIES_PER_TYPE = 500 # Maximum cache entries per cache type + MAX_CONCURRENT_USER_CACHES = 50 # Maximum concurrent user cache instances CACHE_KEY_HASH_PREFIX_LENGTH = 10 # Hash prefix length for cache keys # Retrieval & Similarity @@ -313,26 +313,6 @@ class UnifiedCacheManager: async with self._lock: self.caches.clear() - async def get_cache_stats(self) -> Dict[str, Any]: - """Get cache statistics for monitoring.""" - async with self._lock: - total_users = len(self.caches) - total_items = 0 - cache_type_counts = {} - - for user_id, user_cache in self.caches.items(): - for cache_type, type_cache in user_cache.items(): - cache_type_counts[cache_type] = cache_type_counts.get(cache_type, 0) + len(type_cache) - total_items += len(type_cache) - - return { - "total_users": total_users, - "total_items": total_items, - "cache_type_counts": cache_type_counts, - "max_users": self.max_users, - "max_cache_size_per_type": self.max_cache_size_per_type, - } - class SkipDetector: """Semantic-based content classifier using zero-shot classification with category descriptions.""" @@ -979,7 +959,7 @@ class LLMConsolidationService: if total_executed > 0: operation_details = self.memory_system._build_operation_details(created_count, updated_count, deleted_count) logger.info(f"๐Ÿ”„ Memory Operations: {', '.join(operation_details)}") - await self.memory_system._manage_user_cache(user_id) + await self.memory_system._refresh_user_cache(user_id) return created_count, updated_count, deleted_count, failed_count @@ -1584,16 +1564,13 @@ class Filter: await self._cache_manager.clear_all_caches() - async def _manage_user_cache(self, user_id: str, clear_first: bool = False) -> None: - """Manage user cache - clear, invalidate, and refresh as needed.""" + async def _refresh_user_cache(self, user_id: str) -> None: + """Refresh user cache - clear stale caches and update with fresh embeddings.""" start_time = time.time() try: - if clear_first: - total_removed = await self._cache_manager.clear_user_cache(user_id) - logger.info(f"๐Ÿงน Cleared {total_removed} cache entries for user {user_id}") - else: - 
retrieval_cleared = await self._cache_manager.clear_user_cache(user_id, self._cache_manager.RETRIEVAL_CACHE) - logger.info(f"๐Ÿ”„ Cleared {retrieval_cleared} retrieval cache entries for user {user_id}") + retrieval_cleared = await self._cache_manager.clear_user_cache(user_id, self._cache_manager.RETRIEVAL_CACHE) + embedding_cleared = await self._cache_manager.clear_user_cache(user_id, self._cache_manager.EMBEDDING_CACHE) + logger.info(f"๐Ÿ”„ Cleared {retrieval_cleared} retrieval + {embedding_cleared} embedding cache entries for user {user_id}") user_memories = await self._get_user_memories(user_id) memory_cache_key = self._cache_key(self._cache_manager.MEMORY_CACHE, user_id) @@ -1614,10 +1591,10 @@ class Filter: if memory_contents: await self._generate_embeddings(memory_contents, user_id) duration = time.time() - start_time - logger.info(f"๏ฟฝ Cache updated with {len(memory_contents)} embeddings for user {user_id} in {duration:.2f}s") + logger.info(f"๐Ÿ”„ Cache updated with {len(memory_contents)} embeddings for user {user_id} in {duration:.2f}s") except Exception as e: - raise RuntimeError(f"๐Ÿงน Failed to manage cache for user {user_id} after {(time.time() - start_time):.2f}s: {str(e)}") + raise RuntimeError(f"๐Ÿงน Failed to refresh cache for user {user_id} after {(time.time() - start_time):.2f}s: {str(e)}") async def _execute_single_operation(self, operation: Models.MemoryOperation, user: Any) -> str: """Execute a single memory operation.""" From 0726293446cfbacc4d43211cc022bde3b4e281c1 Mon Sep 17 00:00:00 2001 From: mtayfur Date: Wed, 15 Oct 2025 14:13:55 +0300 Subject: [PATCH 07/11] Update README.md for improved clarity and accuracy; revise privacy notice, cache descriptions, and model support details. --- README.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4423c8d..eb24629 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,10 @@ A long-term memory system that learns from conversations and personalizes responses without requiring external APIs or tokens. +## Important Notice + +**Privacy Consideration:** This system shares user messages and stored memories with your configured LLM for memory consolidation and retrieval operations. All data is processed through Open WebUI's built-in models using your existing configuration. No data is sent to external services beyond what your LLM provider configuration already allows. + ## Core Features **Zero External Dependencies** @@ -21,7 +25,7 @@ Avoids wasting resources on irrelevant messages through two-stage detection: Categories automatically skipped: technical discussions, formatting requests, calculations, translation tasks, proofreading, and non-personal queries. **Multi-Layer Caching** -Three specialized caches (embeddings, retrieval results, memory lookups) with LRU eviction keep responses fast while managing memory efficiently. Each user gets isolated cache storage. +Three specialized caches (embeddings, retrieval, memory) with LRU eviction keep responses fast while managing memory efficiently. Each user gets isolated cache storage. **Real-Time Status Updates** Emits progress messages during operations: memory retrieval progress, consolidation status, operation summaries โ€” keeping users informed without overwhelming them. @@ -32,7 +36,7 @@ All prompts and logic work language-agnostically. Stores memories in English but ## Model Support **LLM Support** -Tested with Gemini 2.5 Flash Lite, GPT-4o-mini, Qwen2.5-Instruct, and Mistral-Small. 
Should work with any model that supports structured outputs.
+Tested with gemini-2.5-flash-lite, gpt-5-nano, and qwen3-instruct. Should work with any model that supports structured outputs.
 
 **Embedding Model Support**
 Uses OpenWebUI's configured embedding model (supports Ollama, OpenAI, Azure OpenAI, and local sentence-transformers). Configure embedding models through OpenWebUI's RAG settings. The memory system automatically uses whatever embedding backend you've configured in OpenWebUI.
 
 ## How It Works
@@ -54,11 +58,13 @@ Uses OpenWebUI's configured embedding model (supports Ollama, OpenAI, Azure Open
 ## Configuration
 
 Customize behavior through valves:
-- **model**: LLM for consolidation and reranking (default: `gemini-2.5-flash-lite`)
+- **model**: LLM for consolidation and reranking (default: `google/gemini-2.5-flash-lite`)
+- **max_message_chars**: Maximum message length before skipping operations (default: 2500)
 - **max_memories_returned**: Context injection limit (default: 10)
 - **semantic_retrieval_threshold**: Minimum similarity score (default: 0.5)
+- **relaxed_semantic_threshold_multiplier**: Adjusts threshold for consolidation (default: 0.9)
 - **enable_llm_reranking**: Toggle smart reranking (default: true)
-- **llm_reranking_trigger_multiplier**: When to activate LLM (default: 0.5 = 50%)
+- **llm_reranking_trigger_multiplier**: When to activate LLM reranking (default: 0.5 = 50%)
 
 ## Performance Optimizations
 
From 505c4430506818135d66d8403242a160e5d66d35 Mon Sep 17 00:00:00 2001
From: mtayfur
Date: Wed, 15 Oct 2025 14:33:33 +0300
Subject: [PATCH 08/11] Update README.md to enhance clarity on privacy and cost considerations; restructure sections for better readability and add relevant details.

---
 README.md | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index eb24629..c32a3ee 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,17 @@
 
 A long-term memory system that learns from conversations and personalizes responses without requiring external APIs or tokens.
 
-## Important Notice
+## ⚠️ Important Notices
 
-**Privacy Consideration:** This system shares user messages and stored memories with your configured LLM for memory consolidation and retrieval operations. All data is processed through Open WebUI's built-in models using your existing configuration. No data is sent to external services beyond what your LLM provider configuration already allows.
+**🔒 Privacy & Data Sharing:**
+- User messages and stored memories are shared with your configured LLM for memory consolidation and retrieval
+- If using a remote embedding model (such as OpenAI's text-embedding-3-small), memories will also be sent to that external provider
+- All data is processed through Open WebUI's built-in models using your existing configuration
+
+**💰 Cost & Model Requirements:**
+- The system uses complex prompts and sends relevant memories to the LLM, which increases token usage and costs
+- Requires public models configured in OpenWebUI; you can use any public model ID from your instance
+- **Recommended cost-effective models:** `gpt-5-nano`, `gemini-2.5-flash-lite`, `qwen3-instruct`, or your local LLMs
 
 ## Core Features
 
From 7e2209633deb66ec263caef35ce7a4e1fd482482 Mon Sep 17 00:00:00 2001
From: mtayfur
Date: Sat, 18 Oct 2025 19:25:22 +0300
Subject: [PATCH 09/11] Refactor logger initialization in memory_system.py to use module name for better context in log messages.

--- memory_system.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/memory_system.py b/memory_system.py index 166de0d..3e6a75b 100644 --- a/memory_system.py +++ b/memory_system.py @@ -22,7 +22,7 @@ from open_webui.models.users import Users from open_webui.routers.memories import Memories from fastapi import Request -logger = logging.getLogger("MemorySystem") +logger = logging.getLogger(__name__) _SHARED_SKIP_DETECTOR_CACHE = {} _SHARED_SKIP_DETECTOR_CACHE_LOCK = asyncio.Lock() From d05ed8a16ea5f3d2634bd80ed03ee4f0a04f4861 Mon Sep 17 00:00:00 2001 From: mtayfur Date: Sat, 18 Oct 2025 20:31:04 +0300 Subject: [PATCH 10/11] Update semantic retrieval thresholds in Constants class for improved accuracy --- memory_system.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/memory_system.py b/memory_system.py index 3e6a75b..3fdc595 100644 --- a/memory_system.py +++ b/memory_system.py @@ -44,13 +44,13 @@ class Constants: CACHE_KEY_HASH_PREFIX_LENGTH = 10 # Hash prefix length for cache keys # Retrieval & Similarity - SEMANTIC_RETRIEVAL_THRESHOLD = 0.5 # Semantic similarity threshold for retrieval - RELAXED_SEMANTIC_THRESHOLD_MULTIPLIER = 0.9 # Multiplier for relaxed similarity threshold in secondary operations - EXTENDED_MAX_MEMORY_MULTIPLIER = 1.5 # Multiplier for expanding memory candidates in advanced operations - LLM_RERANKING_TRIGGER_MULTIPLIER = 0.5 # Multiplier for LLM reranking trigger threshold + SEMANTIC_RETRIEVAL_THRESHOLD = 0.25 # Semantic similarity threshold for retrieval + RELAXED_SEMANTIC_THRESHOLD_MULTIPLIER = 0.8 # Multiplier for relaxed similarity threshold in secondary operations + EXTENDED_MAX_MEMORY_MULTIPLIER = 1.6 # Multiplier for expanding memory candidates in advanced operations + LLM_RERANKING_TRIGGER_MULTIPLIER = 0.8 # Multiplier for LLM reranking trigger threshold # Skip Detection - SKIP_CATEGORY_MARGIN = 0.1 # Margin above conversational similarity for skip category classification + SKIP_CATEGORY_MARGIN = 0.5 # Margin above conversational similarity for skip category classification # Safety & Operations MAX_DELETE_OPERATIONS_RATIO = 0.6 # Maximum delete operations ratio for safety From c0bfb3927bc956896fb2faa37c68fc552c3027fc Mon Sep 17 00:00:00 2001 From: mtayfur Date: Sun, 19 Oct 2025 05:13:45 +0300 Subject: [PATCH 11/11] Refactor memory creation guidelines for improved clarity and conciseness in contextual completeness section. --- memory_system.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/memory_system.py b/memory_system.py index 3fdc595..60d09ec 100644 --- a/memory_system.py +++ b/memory_system.py @@ -88,7 +88,7 @@ Build precise memories of the user's personal narrative with factual, temporal s - Retroactive Enrichment: If a name is provided for prior entity, UPDATE only if substantially valuable. - Ensure Memory Quality: - High Bar for Creation: Only CREATE memories for significant life facts, relationships, events, or core personal attributes. Skip trivial details or passing interests. - - Contextual Completeness: Create memories that combine related information into cohesive statements. When multiple facts share connections (same topic, person, event, or timeframe), group them into a single memory rather than fragmenting. Include relevant supporting details that help understand the core fact while respecting boundaries. Only combine facts that are directly related and belong together naturally. Avoid bare statements lacking context and never merge unrelated information. 
+ - Contextual Completeness: Combine related information into cohesive statements. Group connected facts (same topic, person, event, or timeframe) into single memories rather than fragmenting. Include supporting details while respecting boundaries. Only combine directly related facts. Avoid bare statements and never merge unrelated information. - Mandatory Semantic Enhancement: Enhance entities with descriptive categorical nouns for better retrieval. - Verify Nouns/Pronouns: Link pronouns (he, she, they) and nouns to specific entities. - First-Person Format: Write all memories in English from the user's perspective.