import requests
from typing import List, Dict, Any
from dataclasses import dataclass


@dataclass
class PromptComplianceResult:
    template_id: str
    template_name: str
    compliance_score: float
    passed_checks: List[str]
    failed_checks: List[str]
    recommendations: List[str]

class PromptComplianceValidator:
    def __init__(self, flow_name: str, api_token: str):
        self.flow_name = flow_name
        self.api_token = api_token
        self.base_url = f"https://{flow_name}.flows.graphorlm.com"

        # Define compliance criteria
        self.compliance_criteria = {
            "context_integration": {
                "check": lambda text: "{context}" in text,
                "weight": 0.25,
                "description": "Template includes context placeholder for RAG functionality"
            },
            "structured_instructions": {
                "check": lambda text: any(keyword in text.lower() for keyword in
                                          ["guidelines", "instructions", "rules", "criteria"]),
                "weight": 0.20,
                "description": "Template provides structured behavioral guidelines"
            },
            "anti_hallucination": {
                "check": lambda text: any(phrase in text.lower() for phrase in
                                          ["do not answer from your knowledge", "use the context",
                                           "based on the context", "avoid hallucination"]),
                "weight": 0.20,
                "description": "Template includes anti-hallucination instructions"
            },
            "error_handling": {
                "check": lambda text: any(phrase in text.lower() for phrase in
                                          ["unknown", "don't know", "clarification", "not available"]),
                "weight": 0.15,
                "description": "Template handles unknown or unclear queries"
            },
            "response_structure": {
                "check": lambda text: any(keyword in text.lower() for keyword in
                                          ["format", "structure", "organize", "sections"]),
                "weight": 0.10,
                "description": "Template provides response formatting guidance"
            },
            "appropriate_length": {
                "check": lambda text: 100 <= len(text) <= 2000,
                "weight": 0.10,
                "description": "Template length is appropriate (100-2000 characters)"
            }
        }

    def get_prompts(self) -> List[Dict[str, Any]]:
        """Retrieve all prompts from the flow"""
        response = requests.get(
            f"{self.base_url}/prompts",
            headers={"Authorization": f"Bearer {self.api_token}"}
        )
        response.raise_for_status()
        return response.json()

    def validate_prompt_compliance(self, prompt: Dict[str, Any]) -> PromptComplianceResult:
        """Validate a single prompt against compliance criteria"""
        template_id = prompt.get("id", "")
        template_name = prompt.get("name", "Unnamed")
        template_text = prompt.get("text", "")

        passed_checks = []
        failed_checks = []
        compliance_score = 0.0
        recommendations = []

        for criterion_name, criterion in self.compliance_criteria.items():
            if criterion["check"](template_text):
                passed_checks.append(criterion_name)
                compliance_score += criterion["weight"]
            else:
                failed_checks.append(criterion_name)
                recommendations.append(f"Add {criterion['description'].lower()}")

        # Additional specific recommendations
        if "context_integration" in failed_checks:
            recommendations.append("Include '{context}' placeholder in your template")
        if "anti_hallucination" in failed_checks:
            recommendations.append("Add instruction to use only provided context, not LLM knowledge")
        if len(template_text) < 100:
            recommendations.append("Expand template with more detailed instructions")
        elif len(template_text) > 2000:
            recommendations.append("Consider simplifying template for better clarity")

        return PromptComplianceResult(
            template_id=template_id,
            template_name=template_name,
            compliance_score=compliance_score,
            passed_checks=passed_checks,
            failed_checks=failed_checks,
            recommendations=recommendations
        )

    def validate_all_prompts(self) -> Dict[str, Any]:
        """Validate all prompts and generate compliance report"""
        prompts = self.get_prompts()
        validation_results = []
        total_score = 0.0

        for prompt in prompts:
            result = self.validate_prompt_compliance(prompt)
            validation_results.append(result)
            total_score += result.compliance_score

        avg_compliance = total_score / len(prompts) if prompts else 0

        # Categorize results
        excellent = [r for r in validation_results if r.compliance_score >= 0.8]
        good = [r for r in validation_results if 0.6 <= r.compliance_score < 0.8]
        needs_improvement = [r for r in validation_results if r.compliance_score < 0.6]

        return {
            "summary": {
                "total_prompts": len(prompts),
                "average_compliance": avg_compliance,
                "excellent_count": len(excellent),
                "good_count": len(good),
                "needs_improvement_count": len(needs_improvement)
            },
            "results": validation_results,
            "excellent": excellent,
            "good": good,
            "needs_improvement": needs_improvement
        }

    def print_compliance_report(self, report: Dict[str, Any]):
        """Print formatted compliance report"""
        summary = report["summary"]

        print("🔍 Prompt Template Compliance Report")
        print("=" * 50)
        print(f"Flow: {self.flow_name}")
        print(f"Total Templates: {summary['total_prompts']}")
        print(f"Average Compliance Score: {summary['average_compliance']:.2f}/1.0")

        print("\n📊 Compliance Distribution:")
        print(f"   🟢 Excellent (≥0.8): {summary['excellent_count']} templates")
        print(f"   🟡 Good (0.6-0.79): {summary['good_count']} templates")
        print(f"   🔴 Needs Improvement (<0.6): {summary['needs_improvement_count']} templates")

        # Detail each template
        print("\n📋 Template Details:")
        print("-" * 40)

        for result in report["results"]:
            score_icon = "🟢" if result.compliance_score >= 0.8 else \
                         "🟡" if result.compliance_score >= 0.6 else "🔴"
            print(f"\n{score_icon} {result.template_name}")
            print(f"   ID: {result.template_id}")
            print(f"   Compliance Score: {result.compliance_score:.2f}/1.0")

            if result.passed_checks:
                print(f"   ✅ Passed: {', '.join(result.passed_checks)}")
            if result.failed_checks:
                print(f"   ❌ Failed: {', '.join(result.failed_checks)}")
            if result.recommendations:
                print("   💡 Recommendations:")
                for rec in result.recommendations:
                    print(f"      - {rec}")

        # Global recommendations
        if summary['needs_improvement_count'] > 0:
            print("\n🎯 Priority Actions:")
            print("   - Focus on templates with compliance score < 0.6")
            print("   - Ensure all templates include context integration")
            print("   - Add anti-hallucination instructions to maintain accuracy")


# Usage
validator = PromptComplianceValidator("my-rag-pipeline", "YOUR_API_TOKEN")

try:
    report = validator.validate_all_prompts()
    validator.print_compliance_report(report)
except Exception as e:
    print(f"Compliance validation failed: {e}")
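

# Optional follow-up (illustrative sketch, not part of the GraphorLM API): persist the
# report so it can be archived or diffed in CI. PromptComplianceResult is a dataclass,
# so dataclasses.asdict() makes each result JSON-serializable. The save_report helper
# and the output file name below are hypothetical.
import json
from dataclasses import asdict

def save_report(report: Dict[str, Any], path: str = "compliance_report.json") -> None:
    serializable = {
        "summary": report["summary"],
        "results": [asdict(r) for r in report["results"]],
    }
    with open(path, "w", encoding="utf-8") as f:
        json.dump(serializable, f, indent=2, ensure_ascii=False)

# Example: save_report(report)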