alfulanny committed on
Commit
04bceee
·
verified ·
1 Parent(s): 4374e5c

Update evaluate_and_submit.py

Browse files
Files changed (1) hide show
  1. evaluate_and_submit.py +103 -0
evaluate_and_submit.py CHANGED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Script to run the agent on Unit 4 questions and optionally submit results to the course scoring API.
3
+
4
+ Usage:
5
+ # Dry run (no submit) on first 5 questions:
6
+ python evaluate_and_submit.py --limit 5
7
+
8
+ # Submit results (requires username and Space URL):
9
+ python evaluate_and_submit.py --submit --username YOUR_HF_USERNAME \\
10
+ --agent-code-url https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE
11
+
12
+ Notes:
13
+ - The scoring API expects exact-match answers, so make sure the agent returns only
14
+ the final answer without extra formatting.
15
+ - Ensure you've run `huggingface-cli login` before submission.
16
+ """
17
+ import argparse
18
+ import os
19
+ import sys
20
+ import time
21
+ from typing import Any, Dict, List
22
+
23
+ from evaluation_client import ScoringAPIClient
24
+ from code_agent import run_agent
25
+
26
+
27
def extract_prompt_from_question(q: Dict[str, Any]) -> str:
    """Return the prompt text carried by a question record.

    The fields "question", "prompt", "input", "text" and "task" are checked
    in that order; the first one holding a string is returned. If none of
    them holds a string, the string form of the whole record is returned.
    """
    candidate_fields = ("question", "prompt", "input", "text", "task")
    # Preserve priority order: the comprehension walks the fields in sequence,
    # so the first element is the highest-priority string value.
    texts = [q[name] for name in candidate_fields if isinstance(q.get(name), str)]
    return texts[0] if texts else str(q)
33
+
34
+
35
def main(argv: List[str]) -> None:
    """Run the agent over the course questions and optionally submit the answers.

    Args:
        argv: Command-line arguments, excluding the program name.

    The process exits with status 1 when ``--submit`` is given without a
    username or agent code URL, when the scoring API returns no questions,
    or when the submission call raises.
    """
    parser = argparse.ArgumentParser(description="Evaluate agent on course questions and optionally submit.")
    parser.add_argument("--limit", type=int, default=0, help="Max questions to process (0=all)")
    parser.add_argument("--submit", action="store_true", help="Submit answers to scoring API")
    parser.add_argument("--username", type=str, default=os.environ.get("HF_USERNAME"), help="HF username for submission")
    parser.add_argument("--agent-code-url", type=str, default=os.environ.get("AGENT_CODE_URL"), help="Public Space URL for your agent")
    args = parser.parse_args(argv)

    # Check the submission prerequisites up front. Running the agent on every
    # question can be slow, so a missing flag should fail before any of that
    # work is done rather than after it.
    if args.submit:
        if not args.username:
            print("ERROR: --submit requires --username (or set HF_USERNAME env var)")
            sys.exit(1)
        if not args.agent_code_url:
            print("ERROR: --submit requires --agent-code-url (or set AGENT_CODE_URL env var)")
            sys.exit(1)

    client = ScoringAPIClient()
    print("Fetching questions from scoring API...")
    questions = client.get_questions()
    if not questions:
        print("ERROR: No questions returned by the API.")
        sys.exit(1)

    if args.limit > 0:
        questions = questions[:args.limit]

    print(f"Processing {len(questions)} questions...")
    answers = []

    for idx, q in enumerate(questions, 1):
        # The task identifier is looked up under several possible key names.
        task_id = q.get("task_id") or q.get("id") or q.get("taskId")
        prompt = extract_prompt_from_question(q)

        print(f"\n[{idx}/{len(questions)}] Task {task_id}")
        print(f"  Prompt: {prompt[:100]}...")

        try:
            ans = run_agent(prompt)
            ans = ans.strip()
            answers.append({"task_id": task_id, "submitted_answer": ans})
            print(f"  Answer: {ans[:80]}...")
        except Exception as e:
            # Deliberately broad: one failing question must not abort the run.
            print(f"  ERROR: {type(e).__name__}: {str(e)[:100]}")
            # Still add an error answer to maintain alignment
            answers.append({"task_id": task_id, "submitted_answer": f"(error) {type(e).__name__}"})

        # Polite pacing to avoid rate limits
        time.sleep(0.5)

    print(f"\n✓ Prepared answers for {len(answers)} tasks")

    if args.submit:
        print(f"\nSubmitting {len(answers)} answers as user '{args.username}'...")
        print(f"Agent Code URL: {args.agent_code_url}")

        try:
            resp = client.submit(username=args.username, agent_code=args.agent_code_url, answers=answers)
            print("✓ Submission successful!")
            print(f"Response: {resp}")
        except Exception as e:
            print(f"ERROR: Submission failed: {e}")
            sys.exit(1)
    else:
        print("\nDry run complete. To submit, re-run with:")
        print("  python evaluate_and_submit.py --submit --username YOUR_USERNAME \\")
        print("    --agent-code-url https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE")
100
+
101
+
102
# Script entry point: when the file is executed directly, forward the
# command-line arguments (without the program name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])