From 308cf1285e689bf6677312470b81b86742406375 Mon Sep 17 00:00:00 2001 From: mtayfur Date: Wed, 26 Nov 2025 16:59:54 +0300 Subject: [PATCH] refactor(SkipDetector): optimize line structure handling and reduce redundant splits Refactors the skip detection logic to precompute and reuse line splits and non-empty line lists, reducing repeated operations and improving efficiency and readability, while also fixing edge cases for empty messages and ensuring more robust pattern detection. --- memory_system.py | 97 +++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 50 deletions(-) diff --git a/memory_system.py b/memory_system.py index 9c6bdf5..3b2676c 100644 --- a/memory_system.py +++ b/memory_system.py @@ -482,6 +482,14 @@ class SkipDetector: def _fast_path_skip_detection(self, message: str) -> Optional[bool]: """Language-agnostic structural pattern detection with high confidence and low false positive rate.""" msg_len = len(message) + if msg_len == 0: + return None + + # Pre-compute line structures used by multiple patterns + lines = message.split("\n") + line_count = len(lines) + non_empty_lines = [line for line in lines if line.strip()] + non_empty_count = len(non_empty_lines) # Pattern 1: Multiple URLs (5+ full URLs indicates link lists or technical references) url_pattern_count = message.count("http://") + message.count("https://") @@ -489,8 +497,7 @@ class SkipDetector: return True # Pattern 2: Long unbroken alphanumeric strings (tokens, hashes, base64) - words = message.split() - for word in words: + for word in message.split(): cleaned = word.strip('.,;:!?()[]{}"\'"') if len(cleaned) > 80 and cleaned.replace("-", "").replace("_", "").isalnum(): return True @@ -502,25 +509,25 @@ class SkipDetector: return True # Pattern 4: Command-line patterns with context-aware detection - lines_stripped = [line.strip() for line in message.split("\n") if line.strip()] - if lines_stripped: + if non_empty_lines: actual_command_lines = 0 - for line in lines_stripped: - if line.startswith("$ ") and len(line) > 2: - parts = line[2:].split() + for line in non_empty_lines: + stripped = line.strip() + if stripped.startswith("$ ") and len(stripped) > 2: + parts = stripped[2:].split() if parts and parts[0].isalnum(): actual_command_lines += 1 - elif "$ " in line: - dollar_index = line.find("$ ") - if dollar_index > 0 and line[dollar_index - 1] in (" ", ":", "\t"): - parts = line[dollar_index + 2 :].split() + elif "$ " in stripped: + dollar_index = stripped.find("$ ") + if dollar_index > 0 and stripped[dollar_index - 1] in (" ", ":", "\t"): + parts = stripped[dollar_index + 2 :].split() if parts and len(parts[0]) > 0 and (parts[0].isalnum() or parts[0] in ["curl", "wget", "git", "npm", "pip", "docker"]): actual_command_lines += 1 - elif line.startswith("# ") and len(line) > 2: - rest = line[2:].strip() + elif stripped.startswith("# ") and len(stripped) > 2: + rest = stripped[2:].strip() if rest and not rest[0].isupper() and " " in rest: actual_command_lines += 1 - elif line.startswith("> ") and len(line) > 2: + elif stripped.startswith("> ") and len(stripped) > 2: pass if actual_command_lines >= 1 and any(c in message for c in ["http://", "https://", " | "]): @@ -546,50 +553,40 @@ class SkipDetector: return True # Pattern 7: Structured nested content with colons (key: value patterns) - line_count = message.count("\n") - if line_count >= 8: - lines = message.split("\n") - non_empty_lines = [line for line in lines if line.strip()] - if non_empty_lines: - colon_lines = sum(1 for line in non_empty_lines if ":" in line and not line.strip().startswith("#")) - indented_lines = sum(1 for line in non_empty_lines if line.startswith((" ", "\t"))) + if line_count >= 8 and non_empty_count > 0: + colon_lines = sum(1 for line in non_empty_lines if ":" in line and not line.strip().startswith("#")) + indented_lines = sum(1 for line in non_empty_lines if line.startswith((" ", "\t"))) - if colon_lines / len(non_empty_lines) > 0.4 and indented_lines / len(non_empty_lines) > 0.5: - words_outside_kv = 0 - for line in non_empty_lines: - if ":" not in line: - words_outside_kv += len(line.split()) + if colon_lines / non_empty_count > 0.4 and indented_lines / non_empty_count > 0.5: + words_outside_kv = 0 + for line in non_empty_lines: + if ":" not in line: + words_outside_kv += len(line.split()) - if words_outside_kv < 5: - return True + if words_outside_kv < 5: + return True # Pattern 8: Highly structured multi-line content (require markup chars for technical confidence) - if line_count > 15: - lines = message.split("\n") - non_empty_lines = [line for line in lines if line.strip()] - if non_empty_lines: - markup_in_lines = sum(1 for line in non_empty_lines if any(c in line for c in "{}[]<>")) - structured_lines = sum(1 for line in non_empty_lines if line.startswith((" ", "\t"))) + if line_count > 15 and non_empty_count > 0: + markup_in_lines = sum(1 for line in non_empty_lines if any(c in line for c in "{}[]<>")) + structured_lines = sum(1 for line in non_empty_lines if line.startswith((" ", "\t"))) - if markup_in_lines / len(non_empty_lines) > 0.3: + if markup_in_lines / non_empty_count > 0.3: + return True + elif structured_lines / non_empty_count > 0.6: + operators = ["=", "+", "-", "*", "/", "<", ">", "&", "|", "!", ":", "?"] + operator_count = sum(message.count(op) for op in operators) + if (operator_count / msg_len) > 0.05: return True - elif structured_lines / len(non_empty_lines) > 0.6: - operators = ["=", "+", "-", "*", "/", "<", ">", "&", "|", "!", ":", "?"] - operator_count = sum(message.count(op) for op in operators) - if (operator_count / msg_len) > 0.05: - return True # Pattern 9: Code-like indentation pattern (require code indicators to avoid false positives from bullet lists) - if line_count >= 3: - lines = message.split("\n") - non_empty_lines = [line for line in lines if line.strip()] - if non_empty_lines: - indented_lines = sum(1 for line in non_empty_lines if line[0] in (" ", "\t")) - if indented_lines / len(non_empty_lines) > 0.5: - code_ending_chars = ["{", "}", "(", ")", ";"] - lines_with_code_endings = sum(1 for line in non_empty_lines if line.strip().endswith(tuple(code_ending_chars))) - if lines_with_code_endings / len(non_empty_lines) > 0.2: - return True + if line_count >= 3 and non_empty_count > 0: + indented_lines = sum(1 for line in non_empty_lines if line[0] in (" ", "\t")) + if indented_lines / non_empty_count > 0.5: + code_ending_chars = ["{", "}", "(", ")", ";"] + lines_with_code_endings = sum(1 for line in non_empty_lines if line.strip().endswith(tuple(code_ending_chars))) + if lines_with_code_endings / non_empty_count > 0.2: + return True # Pattern 10: Very high special character ratio (encoded data, technical output) if msg_len > 50: