Mirror of https://github.com/mtayfur/openwebui-memory-system.git, synced 2026-01-22 15:01:02 +01:00
Merge branch 'dev' of https://github.com/GlisseManTV/openwebui-memory-system into dev
README.md
@@ -2,6 +2,18 @@

 A long-term memory system that learns from conversations and personalizes responses without requiring external APIs or tokens.

+## ⚠️ Important Notices
+
+**🔒 Privacy & Data Sharing:**
+
+- User messages and stored memories are shared with your configured LLM for memory consolidation and retrieval
+- If using remote embedding models (like OpenAI text-embedding-3-small), memories will also be sent to those external providers
+- All data is processed through Open WebUI's built-in models using your existing configuration
+
+**💰 Cost & Model Requirements:**
+
+- The system uses complex prompts and sends relevant memories to the LLM, which increases token usage and costs
+- Requires public models configured in OpenWebUI; you can use any public model ID from your instance
+- **Recommended cost-effective models:** `gpt-5-nano`, `gemini-2.5-flash-lite`, `qwen3-instruct`, or your local LLMs

 ## Core Features

 **Zero External Dependencies**
@@ -21,7 +33,7 @@ Avoids wasting resources on irrelevant messages through two-stage detection:

 Categories automatically skipped: technical discussions, formatting requests, calculations, translation tasks, proofreading, and non-personal queries.
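As an illustration of the two-stage idea, here is a minimal sketch: a cheap keyword fast path first, then an embedding comparison against descriptions of the skipped categories. Every name below is hypothetical, not the filter's actual code:

```python
import numpy as np

# Hypothetical two-stage skip detection; names and threshold are assumptions.
SKIP_KEYWORDS = ("translate", "proofread", "calculate", "format")

def should_skip(message: str, embed, category_embeddings: np.ndarray,
                threshold: float = 0.6) -> bool:
    """Return True when a message is unlikely to contain personal facts."""
    text = message.lower()
    # Stage 1: fast path, plain keyword screening with no model calls.
    if any(keyword in text for keyword in SKIP_KEYWORDS):
        return True
    # Stage 2: semantic path, compare against embedded category descriptions.
    vector = embed([message])[0]
    vector = vector / np.linalg.norm(vector)
    similarities = category_embeddings @ vector  # rows assumed L2-normalized
    return float(similarities.max()) >= threshold
```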
 **Multi-Layer Caching**
-Three specialized caches (embeddings, retrieval results, memory lookups) with LRU eviction keep responses fast while managing memory efficiently. Each user gets isolated cache storage.
+Three specialized caches (embeddings, retrieval, memory) with LRU eviction keep responses fast while managing memory efficiently. Each user gets isolated cache storage.
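A minimal sketch of the LRU mechanism behind these caches, built on `OrderedDict` (which the filter imports); the 500-entry bound mirrors the `MAX_CACHE_ENTRIES_PER_TYPE` constant further down, but the class itself is an assumption, not the project's code:

```python
from collections import OrderedDict

class LRUCache:
    """Bounded cache that evicts the least recently used entry first."""

    def __init__(self, max_entries: int = 500):
        self.max_entries = max_entries
        self._store: OrderedDict = OrderedDict()

    def get(self, key):
        if key not in self._store:
            return None
        self._store.move_to_end(key)  # mark as most recently used
        return self._store[key]

    def put(self, key, value):
        self._store[key] = value
        self._store.move_to_end(key)
        if len(self._store) > self.max_entries:
            self._store.popitem(last=False)  # evict the oldest entry

# Per-user isolation: one cache instance per (user, cache type) pair.
user_caches = {("user-1", "embeddings"): LRUCache(500)}
```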
 **Real-Time Status Updates**
 Emits progress messages during operations: memory retrieval progress, consolidation status, operation summaries — keeping users informed without overwhelming them.
@@ -32,10 +44,10 @@ All prompts and logic work language-agnostically. Stores memories in English but

 ## Model Support

 **LLM Support**
-Tested with Gemini 2.5 Flash Lite, GPT-4o-mini, Qwen2.5-Instruct, and Mistral-Small. Should work with any model that supports structured outputs.
+Tested with gemini-2.5-flash-lite, gpt-5-nano, and qwen3-instruct. Should work with any model that supports structured outputs.

 **Embedding Model Support**
-Supports any sentence-transformers model. The default `gte-multilingual-base` works well for diverse languages and is efficient enough for real-time use. Make sure to tweak thresholds if you switch to a different model.
+Uses OpenWebUI's configured embedding model (supports Ollama, OpenAI, Azure OpenAI, and local sentence-transformers). Configure embedding models through OpenWebUI's RAG settings. The memory system automatically uses whatever embedding backend you've configured in OpenWebUI.
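Since similarity scores are not comparable across embedding models, the retrieval thresholds below may need recalibration when the backend changes. A rough way to probe a model's score range with sentence-transformers, using the project's former default model; the probe pairs are made up for illustration:

```python
from sentence_transformers import SentenceTransformer, util

# Probe pairs: (message, stored memory) that should and should not match.
related = ("I adopted a golden retriever last month", "User has a pet dog")
unrelated = ("Convert this JSON to YAML", "User has a pet dog")

# gte-multilingual-base ships custom model code, hence trust_remote_code.
model = SentenceTransformer("Alibaba-NLP/gte-multilingual-base",
                            trust_remote_code=True)

def score(pair):
    a, b = model.encode(list(pair), normalize_embeddings=True)
    return util.cos_sim(a, b).item()

# Pick a retrieval threshold that separates the two clusters of scores.
print(f"related: {score(related):.2f}, unrelated: {score(unrelated):.2f}")
```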
## How It Works

@@ -54,18 +66,19 @@ Supports any sentence-transformers model. The default `gte-multilingual-base` wo

 ## Configuration

 Customize behavior through valves:
-- **model**: LLM for consolidation and reranking (default: `gemini-2.5-flash-lite`)
-- **embedding_model**: Sentence transformer (default: `gte-multilingual-base`)
+- **model**: LLM for consolidation and reranking (default: `google/gemini-2.5-flash-lite`)
 - **max_message_chars**: Maximum message length before skipping operations (default: 2500)
 - **max_memories_returned**: Context injection limit (default: 10)
 - **semantic_retrieval_threshold**: Minimum similarity score (default: 0.5)
 - **relaxed_semantic_threshold_multiplier**: Adjusts threshold for consolidation (default: 0.9)
 - **enable_llm_reranking**: Toggle smart reranking (default: true)
-- **llm_reranking_trigger_multiplier**: When to activate LLM (default: 0.5 = 50%)
+- **llm_reranking_trigger_multiplier**: When to activate LLM reranking (default: 0.5 = 50%)
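For reference, a minimal sketch of how a valve set like the one above is typically declared in an Open WebUI filter, using the standard nested-`pydantic` valve pattern; field names and defaults are copied from the list, everything else is boilerplate assumption:

```python
from pydantic import BaseModel, Field

class Filter:
    class Valves(BaseModel):
        model: str = Field(
            default="google/gemini-2.5-flash-lite",
            description="LLM used for consolidation and reranking",
        )
        max_message_chars: int = Field(
            default=2500, description="Skip operations for longer messages"
        )
        max_memories_returned: int = Field(
            default=10, description="Context injection limit"
        )
        semantic_retrieval_threshold: float = Field(
            default=0.5, description="Minimum similarity score"
        )
        relaxed_semantic_threshold_multiplier: float = Field(
            default=0.9, description="Threshold adjustment for consolidation"
        )
        enable_llm_reranking: bool = Field(default=True)
        llm_reranking_trigger_multiplier: float = Field(default=0.5)

    def __init__(self):
        # Valves are editable per-instance from the Open WebUI admin panel.
        self.valves = self.Valves()
```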
 ## Performance Optimizations

 - Batched embedding generation for efficiency
 - Normalized embeddings for faster similarity computation
-- Cached embeddings prevent redundant model calls
+- Cached embeddings prevent redundant API calls to OpenWebUI's embedding backend
 - LRU eviction keeps memory footprint bounded
 - Fast-path skip detection for instant filtering
 - Selective LLM usage based on candidate count
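On the "normalized embeddings" point above: once vectors are unit length, cosine similarity collapses to a dot product, so a single matrix-vector multiply scores every cached memory at once. A small self-contained illustration; the shapes are arbitrary:

```python
import numpy as np

rng = np.random.default_rng(0)

# 1,000 stored memory embeddings, 384 dims, pre-normalized once at cache time.
memories = rng.random((1000, 384), dtype=np.float32)
memories /= np.linalg.norm(memories, axis=1, keepdims=True)

query = rng.random(384, dtype=np.float32)
query /= np.linalg.norm(query)

# Cosine similarity for all memories at once: no per-pair norms needed.
scores = memories @ query
top_indices = np.argsort(scores)[::-1][:10]  # top-10 retrieval candidates
```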

@@ -8,6 +8,7 @@ import hashlib
 import json
 import logging
+import statistics
 import time
 from collections import OrderedDict
 from datetime import datetime, timezone
@@ -27,6 +28,7 @@ from open_webui.models.users import Users
 from open_webui.routers.memories import Memories
 from fastapi import Request

+logger = logging.getLogger(__name__)

 _SHARED_SKIP_DETECTOR_CACHE = {}
@@ -47,6 +49,8 @@ class Constants:
     # Cache System
+    MAX_CACHE_ENTRIES_PER_TYPE = 500  # Maximum cache entries per cache type
+    MAX_CONCURRENT_USER_CACHES = 50  # Maximum concurrent user cache instances
     CACHE_KEY_HASH_PREFIX_LENGTH = 10  # Hash prefix length for cache keys

     # Retrieval & Similarity
@@ -104,6 +108,7 @@ Build precise memories of the user's personal narrative with factual, temporal s
 - Ensure Memory Quality:
   - High Bar for Creation: Only CREATE memories for significant life facts, relationships, events, or core personal attributes. Skip trivial details or passing interests.
+  - Contextual Completeness: Combine related information into cohesive statements. Group connected facts (same topic, person, event, or timeframe) into single memories rather than fragmenting. Include supporting details while respecting boundaries. Only combine directly related facts. Avoid bare statements and never merge unrelated information.
   - Mandatory Semantic Enhancement: Enhance entities with descriptive categorical nouns for better retrieval.
   - Verify Nouns/Pronouns: Link pronouns (he, she, they) and nouns to specific entities.
   - First-Person Format: Write all memories in English from the user's perspective.
@@ -488,6 +493,8 @@ class SkipDetector:
     def _initialize_reference_embeddings(self) -> None:
         """Compute and cache embeddings for category descriptions."""
         try:
+            technical_embeddings = self.embedding_function(
+                self.TECHNICAL_CATEGORY_DESCRIPTIONS
             )
@@ -862,6 +869,7 @@ CANDIDATE MEMORIES:

         selected_memories = []
         for memory in candidate_memories:
+            if memory["id"] in response.ids and len(selected_memories) < max_count:
                 selected_memories.append(memory)
@@ -1252,6 +1260,7 @@ class LLMConsolidationService:
             )
             logger.info(f"🔄 Memory Operations: {', '.join(operation_details)}")
+            await self.memory_system._refresh_user_cache(user_id)

         return created_count, updated_count, deleted_count, failed_count
@@ -1377,6 +1386,8 @@ class Filter:
         self._background_tasks: set = set()
         self._shutdown_event = asyncio.Event()

+        self._embedding_function = None
+        self._skip_detector = None
@@ -1508,6 +1519,10 @@ class Filter:
         self, embedding: Union[List[float], np.ndarray]
     ) -> np.ndarray:
         """Normalize embedding vector."""
+        if isinstance(embedding, list):
+            embedding = np.array(embedding, dtype=np.float16)
+        else:
+            embedding = embedding.astype(np.float16)
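The hunk above shows only the dtype coercion; given the docstring, an L2 norm division presumably follows in the unchanged lines. A minimal standalone sketch of the full operation, with the zero-vector guard added as an assumption:

```python
from typing import List, Union

import numpy as np

def normalize_embedding(embedding: Union[List[float], np.ndarray]) -> np.ndarray:
    """Coerce to float16 and scale to unit length for cosine-by-dot-product."""
    if isinstance(embedding, list):
        embedding = np.array(embedding, dtype=np.float16)
    else:
        embedding = embedding.astype(np.float16)
    norm = np.linalg.norm(embedding)
    # Guard against degenerate all-zero embeddings.
    return embedding / norm if norm > 0 else embedding
```

Casting to `float16` halves the cache footprint of stored embeddings at some cost in precision, which is generally tolerable for similarity ranking.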
@@ -1674,6 +1689,7 @@ class Filter:
         top_score = max(scores)
         lowest_score = min(scores)
+        median_score = statistics.median(scores)

         context_label = (
             "📊 Consolidation candidate memories"
@@ -2079,6 +2095,8 @@ class Filter:

         await self._cache_manager.clear_all_caches()

+    async def _refresh_user_cache(self, user_id: str) -> None:
+        """Refresh user cache - clear stale caches and update with fresh embeddings."""
         start_time = time.time()
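Only the signature and docstring of `_refresh_user_cache` appear in this hunk. A sketch of what such a refresh plausibly involves, with every dependency injected and hypothetical rather than taken from the source:

```python
import logging
import time
from typing import Awaitable, Callable, Dict, List

logger = logging.getLogger(__name__)

async def refresh_user_cache(
    user_id: str,
    clear_caches: Callable[[str], Awaitable[None]],
    fetch_memories: Callable[[str], Awaitable[List[Dict]]],
    embed_and_cache: Callable[[List[str], str], Awaitable[None]],
) -> None:
    """Drop stale per-user caches, then re-embed the current memory set."""
    start = time.time()
    # Invalidate everything derived from the old memory set.
    await clear_caches(user_id)
    # Warm the embedding cache so the next retrieval is fast again.
    memories = await fetch_memories(user_id)
    await embed_and_cache([m["content"] for m in memories], user_id)
    logger.debug("cache refresh for %s took %.2fs", user_id, time.time() - start)
```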
@@ -2137,6 +2155,8 @@ class Filter:
         """Execute a single memory operation."""
         try:
             if operation.operation == Models.MemoryOperationType.CREATE:
+                content_stripped = operation.content.strip()
+                if not content_stripped:
                     logger.warning(f"⚠️ Skipping CREATE operation: empty content")
@@ -2151,6 +2171,8 @@ class Filter:
                 return Models.MemoryOperationType.CREATE.value

             elif operation.operation == Models.MemoryOperationType.UPDATE:
+                id_stripped = operation.id.strip()
+                if not id_stripped:
                     logger.warning(f"⚠️ Skipping UPDATE operation: empty ID")
@@ -2175,6 +2197,8 @@ class Filter:
                 return Models.MemoryOperationType.UPDATE.value

             elif operation.operation == Models.MemoryOperationType.DELETE:
+                id_stripped = operation.id.strip()
+                if not id_stripped:
                     logger.warning(f"⚠️ Skipping DELETE operation: empty ID")
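All three branches share the same strip-then-skip guard. A self-contained sketch of that validation pattern, with the enum and operation shape inferred from the hunks; field names are assumed:

```python
import logging
from dataclasses import dataclass
from enum import Enum

logger = logging.getLogger(__name__)

class MemoryOperationType(str, Enum):
    CREATE = "CREATE"
    UPDATE = "UPDATE"
    DELETE = "DELETE"

@dataclass
class MemoryOperation:
    operation: MemoryOperationType
    content: str = ""
    id: str = ""

def validate(op: MemoryOperation) -> bool:
    """Reject operations whose required field is empty after stripping."""
    if op.operation == MemoryOperationType.CREATE and not op.content.strip():
        logger.warning("Skipping CREATE operation: empty content")
        return False
    needs_id = op.operation in (MemoryOperationType.UPDATE,
                                MemoryOperationType.DELETE)
    if needs_id and not op.id.strip():
        logger.warning("Skipping %s operation: empty ID", op.operation.value)
        return False
    return True
```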