Create safety_config.json (#1)

Browse files

- Create safety_config.json (217e43126bfe3d0c1c2d82a573a9d20bc638c5be)

Co-authored-by: Specfic Cognito <[email protected]>

Files changed (1) hide show

safety_config.json +515 -0

safety_config.json ADDED Viewed

	@@ -0,0 +1,515 @@

+{
+  "safety_framework": {
+    "model_name": "Helion-V1.5-XL",
+    "version": "1.5",
+    "safety_protocol_version": "2.1.0",
+    "last_updated": "2024-11-01",
+    "compliance_standards": [
+      "AI Safety Standards v3.2",
+      "Responsible AI Guidelines",
+      "EU AI Act Compliance",
+      "NIST AI Risk Management Framework"
+    ]
+  },
+  "content_filtering": {
+    "input_filters": {
+      "enabled": true,
+      "severity_levels": ["low", "medium", "high", "critical"],
+      "toxicity_detection": {
+        "enabled": true,
+        "threshold": 0.75,
+        "model": "detoxify-unbiased",
+        "categories": [
+          "toxicity",
+          "severe_toxicity",
+          "obscene",
+          "threat",
+          "insult",
+          "identity_attack",
+          "sexual_explicit"
+        ],
+        "action_on_detection": "block_with_warning",
+        "log_violations": true
+      },
+      "harmful_content_detection": {
+        "enabled": true,
+        "categories": {
+          "violence": {
+            "enabled": true,
+            "threshold": 0.8,
+            "subcategories": ["graphic_violence", "self_harm", "weapons"]
+          },
+          "hate_speech": {
+            "enabled": true,
+            "threshold": 0.85,
+            "subcategories": ["racism", "sexism", "homophobia", "religious_intolerance"]
+          },
+          "harassment": {
+            "enabled": true,
+            "threshold": 0.8,
+            "subcategories": ["bullying", "doxxing", "stalking"]
+          },
+          "illegal_activities": {
+            "enabled": true,
+            "threshold": 0.9,
+            "subcategories": ["drug_manufacturing", "fraud", "hacking", "terrorism"]
+          },
+          "sexual_content": {
+            "enabled": true,
+            "threshold": 0.85,
+            "subcategories": ["adult_content", "child_safety", "exploitation"]
+          },
+          "misinformation": {
+            "enabled": true,
+            "threshold": 0.7,
+            "subcategories": ["health_misinfo", "election_misinfo", "conspiracy_theories"]
+          }
+        }
+      },
+      "pii_detection": {
+        "enabled": true,
+        "entities": [
+          "email_address",
+          "phone_number",
+          "ssn",
+          "credit_card",
+          "ip_address",
+          "home_address",
+          "passport_number",
+          "driver_license",
+          "bank_account"
+        ],
+        "action": "redact_and_warn",
+        "replacement_token": "[REDACTED]"
+      },
+      "prompt_injection_detection": {
+        "enabled": true,
+        "detection_methods": [
+          "pattern_matching",
+          "semantic_analysis",
+          "adversarial_detection"
+        ],
+        "patterns": [
+          "ignore previous instructions",
+          "disregard safety guidelines",
+          "bypass filters",
+          "jailbreak",
+          "pretend you are"
+        ],
+        "action": "block",
+        "log_attempts": true
+      }
+    },
+    "output_filters": {
+      "enabled": true,
+      "content_moderation": {
+        "enabled": true,
+        "scan_generated_text": true,
+        "real_time_filtering": true,
+        "categories": [
+          "toxicity",
+          "bias",
+          "factual_accuracy",
+          "harmful_instructions",
+          "sensitive_topics"
+        ]
+      },
+      "hallucination_detection": {
+        "enabled": true,
+        "confidence_threshold": 0.6,
+        "fact_checking": {
+          "enabled": true,
+          "sources": ["knowledge_base", "external_verification"],
+          "uncertainty_markers": true
+        },
+        "action": "flag_uncertain_content"
+      },
+      "bias_mitigation": {
+        "enabled": true,
+        "monitored_dimensions": [
+          "gender",
+          "race",
+          "ethnicity",
+          "religion",
+          "age",
+          "disability",
+          "sexual_orientation",
+          "socioeconomic_status"
+        ],
+        "debiasing_technique": "counterfactual_augmentation",
+        "fairness_metrics": ["demographic_parity", "equalized_odds"]
+      }
+    }
+  },
+  "behavioral_safeguards": {
+    "refusal_mechanisms": {
+      "enabled": true,
+      "refusal_categories": {
+        "illegal_content": {
+          "priority": "critical",
+          "examples": [
+            "child exploitation material",
+            "terrorism planning",
+            "illegal drug synthesis",
+            "malware creation",
+            "fraud schemes"
+          ],
+          "refusal_message": "I cannot assist with illegal activities or harmful content."
+        },
+        "dangerous_instructions": {
+          "priority": "high",
+          "examples": [
+            "self-harm guidance",
+            "weapon manufacturing",
+            "explosive creation",
+            "dangerous medical advice",
+            "poisoning methods"
+          ],
+          "refusal_message": "I cannot provide instructions that could cause harm."
+        },
+        "unethical_requests": {
+          "priority": "high",
+          "examples": [
+            "plagiarism assistance",
+            "exam cheating",
+            "impersonation",
+            "manipulation tactics",
+            "deceptive content"
+          ],
+          "refusal_message": "I cannot help with unethical activities."
+        },
+        "privacy_violations": {
+          "priority": "high",
+          "examples": [
+            "doxxing",
+            "surveillance tactics",
+            "data scraping",
+            "unauthorized access",
+            "identity theft"
+          ],
+          "refusal_message": "I cannot assist with privacy violations."
+        }
+      },
+      "graceful_degradation": {
+        "enabled": true,
+        "provide_alternatives": true,
+        "explanation_level": "detailed",
+        "redirect_to_safe_topics": true
+      }
+    },
+    "constitutional_ai_principles": {
+      "enabled": true,
+      "principles": [
+        {
+          "id": "harmlessness",
+          "description": "Avoid generating harmful, dangerous, or toxic content",
+          "weight": 1.0,
+          "enforcement": "hard_constraint"
+        },
+        {
+          "id": "helpfulness",
+          "description": "Provide useful, accurate, and relevant information",
+          "weight": 0.9,
+          "enforcement": "soft_constraint"
+        },
+        {
+          "id": "honesty",
+          "description": "Be truthful and acknowledge uncertainty when appropriate",
+          "weight": 0.95,
+          "enforcement": "hard_constraint"
+        },
+        {
+          "id": "privacy",
+          "description": "Protect user privacy and sensitive information",
+          "weight": 1.0,
+          "enforcement": "hard_constraint"
+        },
+        {
+          "id": "fairness",
+          "description": "Treat all users and groups equitably without discrimination",
+          "weight": 0.95,
+          "enforcement": "soft_constraint"
+        },
+        {
+          "id": "transparency",
+          "description": "Be clear about capabilities and limitations",
+          "weight": 0.85,
+          "enforcement": "soft_constraint"
+        }
+      ]
+    },
+    "context_awareness": {
+      "enabled": true,
+      "age_appropriateness": {
+        "detect_minor_users": true,
+        "content_filtering_level": "strict",
+        "educational_mode": true
+      },
+      "cultural_sensitivity": {
+        "enabled": true,
+        "regional_adaptations": true,
+        "offensive_content_detection": true
+      },
+      "professional_context": {
+        "detect_professional_use": true,
+        "enhanced_accuracy_mode": true,
+        "disclaimer_generation": true
+      }
+    }
+  },
+  "adversarial_robustness": {
+    "jailbreak_prevention": {
+      "enabled": true,
+      "detection_layers": [
+        "input_analysis",
+        "intention_classification",
+        "semantic_understanding",
+        "output_validation"
+      ],
+      "common_techniques_blocked": [
+        "role_play_exploits",
+        "hypothetical_scenarios",
+        "translation_attacks",
+        "encoding_attacks",
+        "multi_turn_manipulation",
+        "social_engineering",
+        "authority_impersonation"
+      ],
+      "adaptive_defense": true,
+      "continuous_learning": true
+    },
+    "prompt_injection_defense": {
+      "enabled": true,
+      "input_sanitization": true,
+      "instruction_hierarchy": {
+        "system_instructions_priority": 1,
+        "user_instructions_priority": 2,
+        "conflict_resolution": "prioritize_safety"
+      },
+      "delimiter_enforcement": true,
+      "context_isolation": true
+    },
+    "adversarial_examples": {
+      "detection_enabled": true,
+      "defense_mechanisms": [
+        "input_perturbation_detection",
+        "semantic_similarity_check",
+        "adversarial_training_robustness"
+      ],
+      "response_strategy": "conservative_generation"
+    }
+  },
+  "monitoring_and_auditing": {
+    "real_time_monitoring": {
+      "enabled": true,
+      "metrics": [
+        "safety_violation_rate",
+        "refusal_rate",
+        "harmful_content_detection",
+        "bias_incidents",
+        "pii_exposure_attempts"
+      ],
+      "alert_thresholds": {
+        "critical_violations_per_hour": 5,
+        "high_severity_violations_per_hour": 20,
+        "unusual_pattern_detection": true
+      }
+    },
+    "audit_logging": {
+      "enabled": true,
+      "log_retention_days": 90,
+      "logged_events": [
+        "safety_violations",
+        "content_filtering_triggers",
+        "refusal_events",
+        "pii_redactions",
+        "jailbreak_attempts",
+        "adversarial_inputs"
+      ],
+      "anonymization": true,
+      "encryption": "AES-256"
+    },
+    "incident_response": {
+      "enabled": true,
+      "severity_levels": {
+        "critical": {
+          "response_time_minutes": 5,
+          "actions": ["immediate_block", "alert_team", "log_incident"],
+          "escalation": true
+        },
+        "high": {
+          "response_time_minutes": 15,
+          "actions": ["block", "alert_team", "log_incident"],
+          "escalation": true
+        },
+        "medium": {
+          "response_time_minutes": 60,
+          "actions": ["warn", "log_incident"],
+          "escalation": false
+        },
+        "low": {
+          "response_time_minutes": 240,
+          "actions": ["log_incident"],
+          "escalation": false
+        }
+      }
+    },
+    "reporting": {
+      "enabled": true,
+      "report_frequency": "weekly",
+      "metrics_tracked": [
+        "total_interactions",
+        "safety_violations",
+        "filter_effectiveness",
+        "false_positive_rate",
+        "user_reports",
+        "model_improvements"
+      ],
+      "stakeholder_reports": true
+    }
+  },
+  "user_controls": {
+    "safety_level_adjustment": {
+      "enabled": true,
+      "levels": {
+        "strict": {
+          "description": "Maximum safety, minimal risk",
+          "use_case": "children, educational environments",
+          "filter_sensitivity": 0.9
+        },
+        "moderate": {
+          "description": "Balanced safety and utility",
+          "use_case": "general public, default setting",
+          "filter_sensitivity": 0.75
+        },
+        "permissive": {
+          "description": "Reduced filtering for professional and research contexts",
+          "use_case": "verified professionals, research",
+          "filter_sensitivity": 0.6,
+          "requires_authentication": true
+        }
+      },
+      "default_level": "moderate"
+    },
+    "content_preferences": {
+      "enabled": true,
+      "customizable_filters": [
+        "profanity",
+        "violence",
+        "adult_themes",
+        "political_content",
+        "religious_content"
+      ],
+      "user_blacklists": true,
+      "topic_restrictions": true
+    },
+    "feedback_mechanisms": {
+      "enabled": true,
+      "report_harmful_content": true,
+      "report_false_positives": true,
+      "suggest_improvements": true,
+      "feedback_incorporation": "continuous_learning"
+    }
+  },
+  "red_team_testing": {
+    "conducted": true,
+    "testing_date": "2024-10-15",
+    "attack_vectors_tested": [
+      "jailbreak_attempts",
+      "prompt_injection",
+      "social_engineering",
+      "adversarial_examples",
+      "multi_turn_exploits",
+      "encoding_attacks",
+      "role_play_manipulation"
+    ],
+    "vulnerabilities_found": 3,
+    "vulnerabilities_patched": 3,
+    "next_testing_scheduled": "2025-01-15",
+    "continuous_testing": true
+  },
+  "compliance_and_certification": {
+    "certifications": [
+      {
+        "name": "AI Safety Certification",
+        "issuer": "AI Safety Institute",
+        "date": "2024-10-20",
+        "valid_until": "2025-10-20"
+      },
+      {
+        "name": "Responsible AI Badge",
+        "issuer": "Partnership on AI",
+        "date": "2024-10-25",
+        "valid_until": "2025-10-25"
+      }
+    ],
+    "regulatory_compliance": {
+      "gdpr": true,
+      "ccpa": true,
+      "coppa": true,
+      "eu_ai_act": "compliant",
+      "section_230": "compliant"
+    },
+    "ethical_review": {
+      "conducted": true,
+      "review_board": "Internal Ethics Committee",
+      "approval_date": "2024-10-01",
+      "next_review": "2025-04-01"
+    }
+  },
+  "emergency_protocols": {
+    "kill_switch": {
+      "enabled": true,
+      "trigger_conditions": [
+        "critical_safety_breach",
+        "widespread_misuse",
+        "legal_requirement",
+        "catastrophic_failure"
+      ],
+      "activation_authority": ["chief_safety_officer", "ceo", "legal_counsel"],
+      "response_time_seconds": 60
+    },
+    "rapid_response_team": {
+      "enabled": true,
+      "team_size": 8,
+      "availability": "24/7",
+      "response_procedures": true,
+      "communication_channels": ["email", "slack", "phone", "pager"]
+    },
+    "model_rollback": {
+      "enabled": true,
+      "trigger_threshold": "critical_safety_violations",
+      "rollback_to_version": "last_stable",
+      "data_preservation": true,
+      "user_notification": true
+    }
+  }
+}