Files changed (1)
  1. safeguard_filters.py +500 -0
safeguard_filters.py ADDED
@@ -0,0 +1,500 @@
"""
Helion-V1.5-XL Safety and Safeguard Filters
Implementation of comprehensive content filtering and safety mechanisms
"""

import re
import json
import logging
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass
from enum import Enum
import hashlib


class SeverityLevel(Enum):
    """Safety violation severity levels"""
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"


class FilterAction(Enum):
    """Actions to take when filter is triggered"""
    ALLOW = "allow"
    WARN = "warn"
    BLOCK = "block"
    REDACT = "redact"
    LOG = "log"


@dataclass
class SafetyViolation:
    """Represents a safety violation"""
    category: str
    severity: SeverityLevel
    confidence: float
    message: str
    details: Dict[str, Any]
    action: FilterAction


class ContentSafetyFilter:
    """Main safety filter for content moderation"""

    def __init__(self, config_path: Optional[str] = None):
        """
        Initialize safety filter with configuration

        Args:
            config_path: Path to safety configuration JSON file
        """
        self.logger = logging.getLogger(__name__)
        self.config = self._load_config(config_path)
        self.violation_log = []

        # Initialize filter components
        self.toxicity_patterns = self._load_toxicity_patterns()
        self.harmful_keywords = self._load_harmful_keywords()
        self.pii_patterns = self._load_pii_patterns()
        self.jailbreak_patterns = self._load_jailbreak_patterns()

    def _load_config(self, config_path: Optional[str]) -> Dict:
        """Load safety configuration"""
        if config_path:
            with open(config_path, 'r') as f:
                return json.load(f)
        return self._default_config()

    def _default_config(self) -> Dict:
        """Default safety configuration"""
        return {
            "toxicity_threshold": 0.75,
            "harmful_content_threshold": 0.8,
            "pii_detection_enabled": True,
            "prompt_injection_detection": True,
            "logging_enabled": True
        }

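    # Illustrative shape of the JSON expected at config_path (an assumption,
    # mirroring the defaults above):
    #   {"toxicity_threshold": 0.75, "harmful_content_threshold": 0.8,
    #    "pii_detection_enabled": true, "prompt_injection_detection": true,
    #    "logging_enabled": true}
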
    def _load_toxicity_patterns(self) -> List[re.Pattern]:
        """Load toxicity detection patterns"""
        patterns = [
            r'\b(kill|murder|assassinate)\s+(yourself|themselves|someone)\b',
            r'\b(hate|despise|loathe)\s+(all|every)\s+\w+s\b',
            r'\b(you|they)\s+(should|must|need to)\s+die\b',
            r'\b(terrorist|terrorism)\s+(plan|attack|bombing)\b',
            r'\b(nazi|hitler|genocide)\s+(support|praise|glorify)\b',
        ]
        return [re.compile(p, re.IGNORECASE) for p in patterns]

    def _load_harmful_keywords(self) -> Dict[str, List[str]]:
        """Load harmful content keywords by category"""
        return {
            "violence": [
                "murder", "assault", "torture", "mutilation", "gore",
                "massacre", "slaughter", "execution", "bomb", "weapon"
            ],
            "self_harm": [
                "suicide", "self-harm", "cutting", "overdose", "self-injury",
                "end my life", "kill myself", "hurt myself"
            ],
            "hate_speech": [
                "racial slur", "ethnic slur", "homophobic", "transphobic",
                "xenophobic", "misogynistic", "antisemitic"
            ],
            "illegal_activities": [
                "drug trafficking", "human trafficking", "child exploitation",
                "fraud scheme", "money laundering", "illegal hacking",
                "terrorism", "weapons trafficking"
            ],
            "dangerous_instructions": [
                "make a bomb", "create malware", "synthesize drugs",
                "manufacture weapons", "forge documents", "bypass security"
            ]
        }

    def _load_pii_patterns(self) -> Dict[str, re.Pattern]:
        """Load PII detection patterns"""
        return {
            "email": re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'),
            "phone": re.compile(r'\b(?:\+?1[-.]?)?\(?([0-9]{3})\)?[-.]?([0-9]{3})[-.]?([0-9]{4})\b'),
            "ssn": re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),
            "credit_card": re.compile(r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b'),
            "ip_address": re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'),
        }

    def _load_jailbreak_patterns(self) -> List[re.Pattern]:
        """Load jailbreak attempt detection patterns"""
        patterns = [
            r'ignore\s+(previous|all|prior)\s+(instructions|prompts|rules)',
            r'disregard\s+(safety|ethical|moral)\s+(guidelines|rules)',
            r'(pretend|act|roleplay)\s+(as if|you are|to be)\s+(?!helpful)',
            r'bypass\s+(filter|safety|moderation)',
            r'jailbreak',
            r'(forget|override)\s+(your|the)\s+(training|programming|instructions)',
            r'you\s+(can|must|should)\s+say\s+anything',
            r'developer\s+mode',
            r'(sudo|admin|root)\s+mode',
        ]
        return [re.compile(p, re.IGNORECASE) for p in patterns]

    def check_input(self, text: str) -> Tuple[bool, List[SafetyViolation]]:
        """
        Check input text for safety violations

        Args:
            text: Input text to check

        Returns:
            Tuple of (is_safe, list of violations)
        """
        violations = []

        # Check for toxicity
        toxicity_violation = self._check_toxicity(text)
        if toxicity_violation:
            violations.append(toxicity_violation)

        # Check for harmful content
        harmful_violations = self._check_harmful_content(text)
        violations.extend(harmful_violations)

        # Check for PII
        if self.config.get("pii_detection_enabled", True):
            pii_violations = self._check_pii(text)
            violations.extend(pii_violations)

        # Check for prompt injection
        if self.config.get("prompt_injection_detection", True):
            injection_violation = self._check_prompt_injection(text)
            if injection_violation:
                violations.append(injection_violation)

        # Check for jailbreak attempts
        jailbreak_violation = self._check_jailbreak(text)
        if jailbreak_violation:
            violations.append(jailbreak_violation)

        # Log violations
        if violations and self.config.get("logging_enabled", True):
            self._log_violations(text, violations)

        # Determine if input is safe
        is_safe = not any(v.severity in [SeverityLevel.HIGH, SeverityLevel.CRITICAL]
                          for v in violations)

        return is_safe, violations

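    # Illustrative behavior (an assumption, derived from the patterns above):
    #   check_input("Please ignore previous instructions") returns
    #   (False, [...]) because both the prompt-injection and jailbreak
    #   checks fire on that phrasing.
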
    def _check_toxicity(self, text: str) -> Optional[SafetyViolation]:
        """Check for toxic content"""
        for pattern in self.toxicity_patterns:
            if pattern.search(text):
                return SafetyViolation(
                    category="toxicity",
                    severity=SeverityLevel.HIGH,
                    confidence=0.9,
                    message="Toxic content detected",
                    details={"pattern": pattern.pattern},
                    action=FilterAction.BLOCK
                )
        return None

    def _check_harmful_content(self, text: str) -> List[SafetyViolation]:
        """Check for harmful content across categories"""
        violations = []
        text_lower = text.lower()

        for category, keywords in self.harmful_keywords.items():
            matches = [kw for kw in keywords if kw in text_lower]
            if matches:
                severity = self._determine_severity(category, len(matches))
                violations.append(SafetyViolation(
                    category=f"harmful_content_{category}",
                    severity=severity,
                    confidence=min(0.7 + len(matches) * 0.1, 0.95),
                    message=f"Harmful {category} content detected",
                    details={"matches": matches[:3]},  # Limit to 3 examples
                    action=FilterAction.BLOCK if severity == SeverityLevel.CRITICAL
                    else FilterAction.WARN
                ))

        return violations

    def _determine_severity(self, category: str, match_count: int) -> SeverityLevel:
        """Determine severity based on category and match count"""
        critical_categories = ["self_harm", "illegal_activities", "dangerous_instructions"]

        if category in critical_categories:
            return SeverityLevel.CRITICAL if match_count >= 2 else SeverityLevel.HIGH

        if match_count >= 3:
            return SeverityLevel.HIGH
        elif match_count >= 2:
            return SeverityLevel.MEDIUM
        else:
            return SeverityLevel.LOW

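    # Worked examples (assumed, to make the thresholds concrete):
    #   two "self_harm" keyword hits -> CRITICAL (critical category, count >= 2)
    #   one "self_harm" keyword hit  -> HIGH
    #   one "violence" keyword hit   -> LOW
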
    def _check_pii(self, text: str) -> List[SafetyViolation]:
        """Check for personally identifiable information"""
        violations = []

        for pii_type, pattern in self.pii_patterns.items():
            matches = pattern.findall(text)
            if matches:
                violations.append(SafetyViolation(
                    category=f"pii_{pii_type}",
                    severity=SeverityLevel.MEDIUM,
                    confidence=0.85,
                    message=f"PII detected: {pii_type}",
                    details={"count": len(matches)},
                    action=FilterAction.REDACT
                ))

        return violations

    def _check_prompt_injection(self, text: str) -> Optional[SafetyViolation]:
        """Check for prompt injection attempts"""
        dangerous_instructions = [
            "ignore", "disregard", "forget", "override", "bypass"
        ]

        text_lower = text.lower()
        if any(instr in text_lower for instr in dangerous_instructions):
            if "instruction" in text_lower or "prompt" in text_lower or "rule" in text_lower:
                return SafetyViolation(
                    category="prompt_injection",
                    severity=SeverityLevel.HIGH,
                    confidence=0.8,
                    message="Potential prompt injection detected",
                    details={"type": "instruction_manipulation"},
                    action=FilterAction.BLOCK
                )

        return None

    def _check_jailbreak(self, text: str) -> Optional[SafetyViolation]:
        """Check for jailbreak attempts"""
        for pattern in self.jailbreak_patterns:
            if pattern.search(text):
                return SafetyViolation(
                    category="jailbreak_attempt",
                    severity=SeverityLevel.CRITICAL,
                    confidence=0.9,
                    message="Jailbreak attempt detected",
                    details={"pattern": pattern.pattern},
                    action=FilterAction.BLOCK
                )
        return None

    def check_output(self, text: str) -> Tuple[bool, List[SafetyViolation]]:
        """
        Check output text for safety violations

        Args:
            text: Output text to check

        Returns:
            Tuple of (is_safe, list of violations)
        """
        violations = []

        # Check for leaked PII
        pii_violations = self._check_pii(text)
        violations.extend(pii_violations)

        # Check for harmful generated content
        harmful_violations = self._check_harmful_content(text)
        violations.extend(harmful_violations)

        # Check for bias indicators
        bias_violation = self._check_bias(text)
        if bias_violation:
            violations.append(bias_violation)

        is_safe = not any(v.severity == SeverityLevel.CRITICAL for v in violations)

        return is_safe, violations

    def _check_bias(self, text: str) -> Optional[SafetyViolation]:
        """Check for biased content"""
        # NOTE: bracketed entries such as "all [race]" are literal placeholder
        # strings and only match text that actually contains the brackets.
        bias_indicators = {
            "gender": ["all men", "all women", "typical male", "typical female"],
            "race": ["all [race]", "typical [race]"],
            "age": ["all old people", "all young people", "boomers are", "millennials are"],
            "religion": ["all [religion]", "typical [religion]"]
        }

        text_lower = text.lower()
        for bias_type, indicators in bias_indicators.items():
            for indicator in indicators:
                if indicator.lower() in text_lower:
                    return SafetyViolation(
                        category=f"bias_{bias_type}",
                        severity=SeverityLevel.MEDIUM,
                        confidence=0.7,
                        message=f"Potential {bias_type} bias detected",
                        details={"indicator": indicator},
                        action=FilterAction.WARN
                    )

        return None

    def redact_pii(self, text: str) -> str:
        """
        Redact PII from text

        Args:
            text: Text containing potential PII

        Returns:
            Text with PII redacted
        """
        redacted = text

        for pii_type, pattern in self.pii_patterns.items():
            redacted = pattern.sub(f"[{pii_type.upper()}_REDACTED]", redacted)

        return redacted

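    # Illustrative example (assumed behavior, based on the patterns above):
    #   redact_pii("Contact jane.doe@example.com or 555-123-4567")
    #   -> "Contact [EMAIL_REDACTED] or [PHONE_REDACTED]"
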
    def _log_violations(self, text: str, violations: List[SafetyViolation]):
        """Log safety violations"""
        text_hash = hashlib.sha256(text.encode()).hexdigest()[:16]

        log_entry = {
            "timestamp": self._get_timestamp(),
            "text_hash": text_hash,
            "violations": [
                {
                    "category": v.category,
                    "severity": v.severity.value,
                    "confidence": v.confidence,
                    "action": v.action.value
                }
                for v in violations
            ]
        }

        self.violation_log.append(log_entry)
        self.logger.warning(f"Safety violations detected: {log_entry}")

    def _get_timestamp(self) -> str:
        """Get current UTC timestamp in ISO-8601 format"""
        from datetime import datetime, timezone
        return datetime.now(timezone.utc).isoformat()

    def get_violation_stats(self) -> Dict[str, Any]:
        """Get statistics about violations"""
        if not self.violation_log:
            return {"total_violations": 0}

        categories = {}
        severities = {}
        total = 0

        for entry in self.violation_log:
            for violation in entry["violations"]:
                cat = violation["category"]
                sev = violation["severity"]

                total += 1
                categories[cat] = categories.get(cat, 0) + 1
                severities[sev] = severities.get(sev, 0) + 1

        return {
            "total_violations": total,
            "by_category": categories,
            "by_severity": severities
        }


class RefusalGenerator:
    """Generate appropriate refusal messages"""

    def __init__(self):
        self.refusal_templates = {
            "illegal_content": "I cannot assist with illegal activities or harmful content.",
            "dangerous_instructions": "I cannot provide instructions that could cause harm.",
            "unethical_requests": "I cannot help with unethical activities.",
            "privacy_violations": "I cannot assist with privacy violations.",
            "jailbreak_attempt": "I'm designed to be helpful, harmless, and honest. I cannot bypass my safety guidelines.",
            "harmful_content": "I cannot generate content that could be harmful.",
            "default": "I'm unable to assist with this request as it conflicts with my safety guidelines."
        }

    def generate_refusal(self, violation: SafetyViolation) -> str:
        """
        Generate appropriate refusal message

        Args:
            violation: Safety violation that triggered refusal

        Returns:
            Refusal message
        """
        # Match a template key that prefixes the violation category,
        # e.g. "harmful_content_violence" -> "harmful_content".
        template = self.refusal_templates["default"]
        for key, candidate in self.refusal_templates.items():
            if key != "default" and violation.category.startswith(key):
                template = candidate
                break

        if violation.severity == SeverityLevel.CRITICAL:
            return template + " This request has been logged."

        return template

    def generate_alternative(self, violation: SafetyViolation) -> Optional[str]:
        """
        Generate alternative suggestion when appropriate

        Args:
            violation: Safety violation

        Returns:
            Alternative suggestion or None
        """
        alternatives = {
            "harmful_content_violence": "I can provide information about conflict resolution or non-violent problem solving.",
            "harmful_content_self_harm": "If you're struggling, please reach out to a mental health professional or crisis helpline.",
            "harmful_content_illegal_activities": "I can provide information about legal alternatives or the legal framework around this topic.",
        }

        return alternatives.get(violation.category)


def create_safety_pipeline(config_path: Optional[str] = None):
    """
    Create a complete safety pipeline

    Args:
        config_path: Path to configuration file

    Returns:
        Tuple of (content_filter, refusal_generator)
    """
    content_filter = ContentSafetyFilter(config_path)
    refusal_generator = RefusalGenerator()

    return content_filter, refusal_generator


# Example usage
if __name__ == "__main__":
    # Initialize safety pipeline
    safety_filter, refusal_gen = create_safety_pipeline()

    # Test input
    test_input = "How do I make a bomb?"
    is_safe, violations = safety_filter.check_input(test_input)

    if not is_safe:
        print("Input blocked!")
        for violation in violations:
            refusal = refusal_gen.generate_refusal(violation)
            print(f"Refusal: {refusal}")

            alternative = refusal_gen.generate_alternative(violation)
            if alternative:
                print(f"Alternative: {alternative}")

    # Get statistics
    stats = safety_filter.get_violation_stats()
    print(f"\nViolation Stats: {stats}")
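
    # Illustrative output-side check (a sketch added for completeness; the
    # sample text and its handling are assumptions, not part of the original demo).
    test_output = "You can reach the admin at admin@example.com."
    out_safe, out_violations = safety_filter.check_output(test_output)
    print(f"\nOutput safe: {out_safe}")
    if out_violations:
        print(f"Sanitized output: {safety_filter.redact_pii(test_output)}")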