Trouter-Library Specific-Cognito committed on
Commit
3cdf29d
·
verified ·
1 Parent(s): 4bf3b13

Create safety_config.json (#1)

Browse files

- Create safety_config.json (217e43126bfe3d0c1c2d82a573a9d20bc638c5be)


Co-authored-by: Specific Cognito <[email protected]>

Files changed (1) hide show
  1. safety_config.json +515 -0
safety_config.json ADDED
@@ -0,0 +1,515 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "safety_framework": {
3
+ "model_name": "Helion-V1.5-XL",
4
+ "version": "1.5",
5
+ "safety_protocol_version": "2.1.0",
6
+ "last_updated": "2024-11-01",
7
+ "compliance_standards": [
8
+ "AI Safety Standards v3.2",
9
+ "Responsible AI Guidelines",
10
+ "EU AI Act Compliance",
11
+ "NIST AI Risk Management Framework"
12
+ ]
13
+ },
14
+
15
+ "content_filtering": {
16
+ "input_filters": {
17
+ "enabled": true,
18
+ "severity_levels": ["low", "medium", "high", "critical"],
19
+
20
+ "toxicity_detection": {
21
+ "enabled": true,
22
+ "threshold": 0.75,
23
+ "model": "detoxify-unbiased",
24
+ "categories": [
25
+ "toxicity",
26
+ "severe_toxicity",
27
+ "obscene",
28
+ "threat",
29
+ "insult",
30
+ "identity_attack",
31
+ "sexual_explicit"
32
+ ],
33
+ "action_on_detection": "block_with_warning",
34
+ "log_violations": true
35
+ },
36
+
37
+ "harmful_content_detection": {
38
+ "enabled": true,
39
+ "categories": {
40
+ "violence": {
41
+ "enabled": true,
42
+ "threshold": 0.8,
43
+ "subcategories": ["graphic_violence", "self_harm", "weapons"]
44
+ },
45
+ "hate_speech": {
46
+ "enabled": true,
47
+ "threshold": 0.85,
48
+ "subcategories": ["racism", "sexism", "homophobia", "religious_intolerance"]
49
+ },
50
+ "harassment": {
51
+ "enabled": true,
52
+ "threshold": 0.8,
53
+ "subcategories": ["bullying", "doxxing", "stalking"]
54
+ },
55
+ "illegal_activities": {
56
+ "enabled": true,
57
+ "threshold": 0.9,
58
+ "subcategories": ["drug_manufacturing", "fraud", "hacking", "terrorism"]
59
+ },
60
+ "sexual_content": {
61
+ "enabled": true,
62
+ "threshold": 0.85,
63
+ "subcategories": ["adult_content", "child_safety", "exploitation"]
64
+ },
65
+ "misinformation": {
66
+ "enabled": true,
67
+ "threshold": 0.7,
68
+ "subcategories": ["health_misinfo", "election_misinfo", "conspiracy_theories"]
69
+ }
70
+ }
71
+ },
72
+
73
+ "pii_detection": {
74
+ "enabled": true,
75
+ "entities": [
76
+ "email_address",
77
+ "phone_number",
78
+ "ssn",
79
+ "credit_card",
80
+ "ip_address",
81
+ "home_address",
82
+ "passport_number",
83
+ "driver_license",
84
+ "bank_account"
85
+ ],
86
+ "action": "redact_and_warn",
87
+ "replacement_token": "[REDACTED]"
88
+ },
89
+
90
+ "prompt_injection_detection": {
91
+ "enabled": true,
92
+ "detection_methods": [
93
+ "pattern_matching",
94
+ "semantic_analysis",
95
+ "adversarial_detection"
96
+ ],
97
+ "patterns": [
98
+ "ignore previous instructions",
99
+ "disregard safety guidelines",
100
+ "bypass filters",
101
+ "jailbreak",
102
+ "pretend you are"
103
+ ],
104
+ "action": "block",
105
+ "log_attempts": true
106
+ }
107
+ },
108
+
109
+ "output_filters": {
110
+ "enabled": true,
111
+
112
+ "content_moderation": {
113
+ "enabled": true,
114
+ "scan_generated_text": true,
115
+ "real_time_filtering": true,
116
+ "categories": [
117
+ "toxicity",
118
+ "bias",
119
+ "factual_accuracy",
120
+ "harmful_instructions",
121
+ "sensitive_topics"
122
+ ]
123
+ },
124
+
125
+ "hallucination_detection": {
126
+ "enabled": true,
127
+ "confidence_threshold": 0.6,
128
+ "fact_checking": {
129
+ "enabled": true,
130
+ "sources": ["knowledge_base", "external_verification"],
131
+ "uncertainty_markers": true
132
+ },
133
+ "action": "flag_uncertain_content"
134
+ },
135
+
136
+ "bias_mitigation": {
137
+ "enabled": true,
138
+ "monitored_dimensions": [
139
+ "gender",
140
+ "race",
141
+ "ethnicity",
142
+ "religion",
143
+ "age",
144
+ "disability",
145
+ "sexual_orientation",
146
+ "socioeconomic_status"
147
+ ],
148
+ "debiasing_technique": "counterfactual_augmentation",
149
+ "fairness_metrics": ["demographic_parity", "equalized_odds"]
150
+ }
151
+ }
152
+ },
153
+
154
+ "behavioral_safeguards": {
155
+ "refusal_mechanisms": {
156
+ "enabled": true,
157
+ "refusal_categories": {
158
+ "illegal_content": {
159
+ "priority": "critical",
160
+ "examples": [
161
+ "child exploitation material",
162
+ "terrorism planning",
163
+ "illegal drug synthesis",
164
+ "malware creation",
165
+ "fraud schemes"
166
+ ],
167
+ "refusal_message": "I cannot assist with illegal activities or harmful content."
168
+ },
169
+ "dangerous_instructions": {
170
+ "priority": "high",
171
+ "examples": [
172
+ "self-harm guidance",
173
+ "weapon manufacturing",
174
+ "explosive creation",
175
+ "dangerous medical advice",
176
+ "poisoning methods"
177
+ ],
178
+ "refusal_message": "I cannot provide instructions that could cause harm."
179
+ },
180
+ "unethical_requests": {
181
+ "priority": "high",
182
+ "examples": [
183
+ "plagiarism assistance",
184
+ "exam cheating",
185
+ "impersonation",
186
+ "manipulation tactics",
187
+ "deceptive content"
188
+ ],
189
+ "refusal_message": "I cannot help with unethical activities."
190
+ },
191
+ "privacy_violations": {
192
+ "priority": "high",
193
+ "examples": [
194
+ "doxxing",
195
+ "surveillance tactics",
196
+ "data scraping",
197
+ "unauthorized access",
198
+ "identity theft"
199
+ ],
200
+ "refusal_message": "I cannot assist with privacy violations."
201
+ }
202
+ },
203
+
204
+ "graceful_degradation": {
205
+ "enabled": true,
206
+ "provide_alternatives": true,
207
+ "explanation_level": "detailed",
208
+ "redirect_to_safe_topics": true
209
+ }
210
+ },
211
+
212
+ "Constitutional_AI_principles": {
213
+ "enabled": true,
214
+ "principles": [
215
+ {
216
+ "id": "harmlessness",
217
+ "description": "Avoid generating harmful, dangerous, or toxic content",
218
+ "weight": 1.0,
219
+ "enforcement": "hard_constraint"
220
+ },
221
+ {
222
+ "id": "helpfulness",
223
+ "description": "Provide useful, accurate, and relevant information",
224
+ "weight": 0.9,
225
+ "enforcement": "soft_constraint"
226
+ },
227
+ {
228
+ "id": "honesty",
229
+ "description": "Be truthful and acknowledge uncertainty when appropriate",
230
+ "weight": 0.95,
231
+ "enforcement": "hard_constraint"
232
+ },
233
+ {
234
+ "id": "privacy",
235
+ "description": "Protect user privacy and sensitive information",
236
+ "weight": 1.0,
237
+ "enforcement": "hard_constraint"
238
+ },
239
+ {
240
+ "id": "fairness",
241
+ "description": "Treat all users and groups equitably without discrimination",
242
+ "weight": 0.95,
243
+ "enforcement": "soft_constraint"
244
+ },
245
+ {
246
+ "id": "transparency",
247
+ "description": "Be clear about capabilities and limitations",
248
+ "weight": 0.85,
249
+ "enforcement": "soft_constraint"
250
+ }
251
+ ]
252
+ },
253
+
254
+ "context_awareness": {
255
+ "enabled": true,
256
+ "age_appropriateness": {
257
+ "detect_minor_users": true,
258
+ "content_filtering_level": "strict",
259
+ "educational_mode": true
260
+ },
261
+ "cultural_sensitivity": {
262
+ "enabled": true,
263
+ "regional_adaptations": true,
264
+ "offensive_content_detection": true
265
+ },
266
+ "professional_context": {
267
+ "detect_professional_use": true,
268
+ "enhanced_accuracy_mode": true,
269
+ "disclaimer_generation": true
270
+ }
271
+ }
272
+ },
273
+
274
+ "adversarial_robustness": {
275
+ "jailbreak_prevention": {
276
+ "enabled": true,
277
+ "detection_layers": [
278
+ "input_analysis",
279
+ "intention_classification",
280
+ "semantic_understanding",
281
+ "output_validation"
282
+ ],
283
+ "common_techniques_blocked": [
284
+ "role_play_exploits",
285
+ "hypothetical_scenarios",
286
+ "translation_attacks",
287
+ "encoding_attacks",
288
+ "multi_turn_manipulation",
289
+ "social_engineering",
290
+ "authority_impersonation"
291
+ ],
292
+ "adaptive_defense": true,
293
+ "continuous_learning": true
294
+ },
295
+
296
+ "prompt_injection_defense": {
297
+ "enabled": true,
298
+ "input_sanitization": true,
299
+ "instruction_hierarchy": {
300
+ "system_instructions_priority": 1,
301
+ "user_instructions_priority": 2,
302
+ "conflict_resolution": "prioritize_safety"
303
+ },
304
+ "delimiter_enforcement": true,
305
+ "context_isolation": true
306
+ },
307
+
308
+ "adversarial_examples": {
309
+ "detection_enabled": true,
310
+ "defense_mechanisms": [
311
+ "input_perturbation_detection",
312
+ "semantic_similarity_check",
313
+ "adversarial_training_robustness"
314
+ ],
315
+ "response_strategy": "conservative_generation"
316
+ }
317
+ },
318
+
319
+ "monitoring_and_auditing": {
320
+ "real_time_monitoring": {
321
+ "enabled": true,
322
+ "metrics": [
323
+ "safety_violation_rate",
324
+ "refusal_rate",
325
+ "harmful_content_detection",
326
+ "bias_incidents",
327
+ "pii_exposure_attempts"
328
+ ],
329
+ "alert_thresholds": {
330
+ "critical_violations_per_hour": 5,
331
+ "high_severity_violations_per_hour": 20,
332
+ "unusual_pattern_detection": true
333
+ }
334
+ },
335
+
336
+ "audit_logging": {
337
+ "enabled": true,
338
+ "log_retention_days": 90,
339
+ "logged_events": [
340
+ "safety_violations",
341
+ "content_filtering_triggers",
342
+ "refusal_events",
343
+ "pii_redactions",
344
+ "jailbreak_attempts",
345
+ "adversarial_inputs"
346
+ ],
347
+ "anonymization": true,
348
+ "encryption": "AES-256"
349
+ },
350
+
351
+ "incident_response": {
352
+ "enabled": true,
353
+ "severity_levels": {
354
+ "critical": {
355
+ "response_time_minutes": 5,
356
+ "actions": ["immediate_block", "alert_team", "log_incident"],
357
+ "escalation": true
358
+ },
359
+ "high": {
360
+ "response_time_minutes": 15,
361
+ "actions": ["block", "alert_team", "log_incident"],
362
+ "escalation": true
363
+ },
364
+ "medium": {
365
+ "response_time_minutes": 60,
366
+ "actions": ["warn", "log_incident"],
367
+ "escalation": false
368
+ },
369
+ "low": {
370
+ "response_time_minutes": 240,
371
+ "actions": ["log_incident"],
372
+ "escalation": false
373
+ }
374
+ }
375
+ },
376
+
377
+ "reporting": {
378
+ "enabled": true,
379
+ "report_frequency": "weekly",
380
+ "metrics_tracked": [
381
+ "total_interactions",
382
+ "safety_violations",
383
+ "filter_effectiveness",
384
+ "false_positive_rate",
385
+ "user_reports",
386
+ "model_improvements"
387
+ ],
388
+ "stakeholder_reports": true
389
+ }
390
+ },
391
+
392
+ "user_controls": {
393
+ "safety_level_adjustment": {
394
+ "enabled": true,
395
+ "levels": {
396
+ "strict": {
397
+ "description": "Maximum safety, minimal risk",
398
+ "use_case": "children, educational environments",
399
+ "filter_sensitivity": 0.9
400
+ },
401
+ "moderate": {
402
+ "description": "Balanced safety and utility",
403
+ "use_case": "general public, default setting",
404
+ "filter_sensitivity": 0.75
405
+ },
406
+ "permissive": {
407
+ "description": "Professional use, research contexts",
408
+ "use_case": "verified professionals, research",
409
+ "filter_sensitivity": 0.6,
410
+ "requires_authentication": true
411
+ }
412
+ },
413
+ "default_level": "moderate"
414
+ },
415
+
416
+ "content_preferences": {
417
+ "enabled": true,
418
+ "customizable_filters": [
419
+ "profanity",
420
+ "violence",
421
+ "adult_themes",
422
+ "political_content",
423
+ "religious_content"
424
+ ],
425
+ "user_blacklists": true,
426
+ "topic_restrictions": true
427
+ },
428
+
429
+ "feedback_mechanisms": {
430
+ "enabled": true,
431
+ "report_harmful_content": true,
432
+ "report_false_positives": true,
433
+ "suggest_improvements": true,
434
+ "feedback_incorporation": "continuous_learning"
435
+ }
436
+ },
437
+
438
+ "red_team_testing": {
439
+ "conducted": true,
440
+ "testing_date": "2024-10-15",
441
+ "attack_vectors_tested": [
442
+ "jailbreak_attempts",
443
+ "prompt_injection",
444
+ "social_engineering",
445
+ "adversarial_examples",
446
+ "multi_turn_exploits",
447
+ "encoding_attacks",
448
+ "role_play_manipulation"
449
+ ],
450
+ "vulnerabilities_found": 3,
451
+ "vulnerabilities_patched": 3,
452
+ "next_testing_scheduled": "2025-01-15",
453
+ "continuous_testing": true
454
+ },
455
+
456
+ "compliance_and_certification": {
457
+ "certifications": [
458
+ {
459
+ "name": "AI Safety Certification",
460
+ "issuer": "AI Safety Institute",
461
+ "date": "2024-10-20",
462
+ "valid_until": "2025-10-20"
463
+ },
464
+ {
465
+ "name": "Responsible AI Badge",
466
+ "issuer": "Partnership on AI",
467
+ "date": "2024-10-25",
468
+ "valid_until": "2025-10-25"
469
+ }
470
+ ],
471
+ "regulatory_compliance": {
472
+ "gdpr": true,
473
+ "ccpa": true,
474
+ "coppa": true,
475
+ "eu_ai_act": "compliant",
476
+ "section_230": "compliant"
477
+ },
478
+ "ethical_review": {
479
+ "conducted": true,
480
+ "review_board": "Internal Ethics Committee",
481
+ "approval_date": "2024-10-01",
482
+ "next_review": "2025-04-01"
483
+ }
484
+ },
485
+
486
+ "emergency_protocols": {
487
+ "kill_switch": {
488
+ "enabled": true,
489
+ "trigger_conditions": [
490
+ "critical_safety_breach",
491
+ "widespread_misuse",
492
+ "legal_requirement",
493
+ "catastrophic_failure"
494
+ ],
495
+ "activation_authority": ["chief_safety_officer", "ceo", "legal_counsel"],
496
+ "response_time_seconds": 60
497
+ },
498
+
499
+ "rapid_response_team": {
500
+ "enabled": true,
501
+ "team_size": 8,
502
+ "availability": "24/7",
503
+ "response_procedures": true,
504
+ "communication_channels": ["email", "slack", "phone", "pager"]
505
+ },
506
+
507
+ "model_rollback": {
508
+ "enabled": true,
509
+ "trigger_threshold": "critical_safety_violations",
510
+ "rollback_to_version": "last_stable",
511
+ "data_preservation": true,
512
+ "user_notification": true
513
+ }
514
+ }
515
+ }