
Commit

fix logs path in evaluation (#278)
Co-authored-by: angrybayblade <vptl185@gmail.com>
shubhras01 and angrybayblade committed Jul 9, 2024
1 parent 78525d6 commit d2d6aa7
Showing 3 changed files with 13 additions and 20 deletions.
python/swe/swe_bench_docker/templates/Dockerfile.swe (2 changes: 0 additions & 2 deletions)

@@ -27,5 +27,3 @@ COPY {{ path_to_reqs }} .
 {% for cmd in install_cmds %}
 RUN {{ cmd }}
 {% endfor %}
-
-ENTRYPOINT ["/bin/bash"]
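
With ENTRYPOINT ["/bin/bash"] removed from the template, an image built from it no longer defaults to a shell, so a caller that wants one has to ask for it explicitly. A hypothetical illustration from Python follows; the image tag and the way swe_bench_docker actually launches containers are assumptions, not taken from this commit.

import subprocess

# Hypothetical invocation: with no ENTRYPOINT baked into the image, the
# command to run (here an interactive bash shell) is passed on the docker
# run line instead of being implied by the image.
subprocess.run(
    ["docker", "run", "--rm", "-it", "swe-bench-image:latest", "/bin/bash"],
    check=True,
)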
python/swe/swekit/benchmark/get_score_card.py (29 changes: 11 additions & 18 deletions)
@@ -1,6 +1,5 @@
 import argparse
 import json
-import logging
 import os
 from pathlib import Path
 
@@ -27,12 +26,7 @@
 SCORECARDS_JSON_PATH = "scorecards.json"
 RESULTS_JSON_PATH = "results.json"
 
-# Configure logging
-logging.basicConfig(
-    level=logging.DEBUG,
-    format="%(asctime)s - %(levelname)s - %(message)s",
-    handlers=[logging.FileHandler("debug.log"), logging.StreamHandler()],
-)
+logger = get_logger(name="get_cur_eval_refs")
 
 
 def format_report(report):
@@ -47,7 +41,6 @@ def format_report(report):
 
 
 def get_cur_eval_refs(predictions_dir, swe_bench_path):
-    logger = get_logger(name="get_cur_eval_refs")
     logger.info(
         f"Getting eval refs for predictions_dir: {predictions_dir} and swe_bench_path: {swe_bench_path}"
     )
@@ -64,28 +57,28 @@ def save_summaries_to_file(predictions_dir, predictions_path, log_dir, scorecard
     path_scorecards = os.path.join(predictions_dir, SCORECARDS_JSON_PATH)
     with open(path_scorecards, "w", encoding="utf-8") as f:
         json.dump(scorecards, fp=f, indent=2)
-    logging.info("- Wrote per-instance scorecards to: %s", path_scorecards)
+    logger.info("- Wrote per-instance scorecards to: %s", path_scorecards)
 
     # Get results and write to file
     eval_refs_json_path = predictions_dir / Path(EVAL_REFS_JSON_PATH)
-    logging.info("Reference Report:")
+    logger.info("Reference Report:")
     report = get_model_report(
         MODEL_GPT4, str(predictions_path), str(eval_refs_json_path), str(log_dir)
     )
     for k, v in report.items():
-        logging.info("- %s: %s", k, len(v))
+        logger.info("- %s: %s", k, len(v))
 
     results_path = predictions_dir / Path(RESULTS_JSON_PATH)
     with open(results_path, "w", encoding="utf-8") as f:
         json.dump(report, f, indent=2)
-    logging.info("- Wrote summary of run to: %s", results_path)
+    logger.info("- Wrote summary of run to: %s", results_path)
 
 
 def generate_scorecard(predictions_dir, log_dir, swe_bench_path, model):
-    logging.info("Starting main function")
+    logger.info("Starting main function")
     eval_refs, _ = get_cur_eval_refs(predictions_dir, swe_bench_path)
     predictions_path = predictions_dir / Path(PATH_PATCHES_JSON)
-    logging.debug("Predictions path: %s", predictions_path)
+    logger.debug("Predictions path: %s", predictions_path)
 
     # Get predictions, define log_dir
     # Iterate over each file in the directory
@@ -98,7 +91,7 @@ def generate_scorecard(predictions_dir, log_dir, swe_bench_path, model):
         if p[KEY_PREDICTION] is None or p[KEY_PREDICTION].strip() == "":
             scorecard["statuses"].append("not_generated")
             scorecards.append(scorecard)
-            logging.info(
+            logger.info(
                 "no prediction_key is found: %s. Skipping...", p[KEY_INSTANCE_ID]
             )
             continue
@@ -110,7 +103,7 @@ def generate_scorecard(predictions_dir, log_dir, swe_bench_path, model):
         if not os.path.exists(log_path):
             scorecard["statuses"].append("build_failure")
             scorecards.append(scorecard)
-            logging.info("no log file is found: %s. Skipping...", log_path)
+            logger.info("no log file is found: %s. Skipping...", log_path)
             continue
 
         # Get evaluation logs
@@ -119,7 +112,7 @@ def generate_scorecard(predictions_dir, log_dir, swe_bench_path, model):
         # Check that the prediction generated
         if not found:
             scorecards.append(scorecard)
-            logging.info("no eval_sm is found: %s. Skipping...", log_path)
+            logger.info("no eval_sm is found: %s. Skipping...", log_path)
             continue
         scorecard["statuses"].append("applied")
 
@@ -162,7 +155,7 @@ def generate_scorecard(predictions_dir, log_dir, swe_bench_path, model):
                 [f.removed for f in diff_obj]
             )
         except Exception as e:
-            logging.error(
+            logger.error(
                 "[%s] Error parsing prediction diff: %s", {p[KEY_INSTANCE_ID]}, e
            )
             scorecard["patch_files"] = []
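
The bulk of this file's change swaps calls on the root logging module, previously configured via logging.basicConfig at import time, for a single module-level logger obtained once from get_logger. A minimal sketch of that pattern follows, using the stdlib logging.getLogger as a stand-in for the project's get_logger helper (its import is outside the hunks shown), and a trimmed save_scorecards function standing in for the real save_summaries_to_file.

import logging


def get_logger(name: str) -> logging.Logger:
    # Stand-in for swekit's helper: one named logger, configured only once.
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        )
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger


# Created once at module level, as in the diff above ...
logger = get_logger(name="get_cur_eval_refs")


def save_scorecards(path_scorecards: str) -> None:
    # ... and reused inside every function instead of calling logging.* directly,
    # so importing the module no longer reconfigures the root logger as a side effect.
    logger.info("- Wrote per-instance scorecards to: %s", path_scorecards)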
python/swe/swekit/benchmark/run_evaluation.py (2 changes: 2 additions & 0 deletions)
@@ -198,6 +198,8 @@ def evaluate(
     generate_report: bool = True,
 ) -> None:
     """Evaluate a callable."""
+    if not os.path.exists(logs_dir):
+        os.makedirs(logs_dir)
     manager = EvaluationManager(
         EvaluationArgs(
             test_range=test_range,
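
This appears to be the change the commit title refers to: evaluate now creates logs_dir before the run instead of assuming it already exists. A sketch of the same guard in isolation follows; ensure_logs_dir is a hypothetical helper, and os.makedirs with exist_ok=True is an equivalent, race-free alternative to the check-then-create added in the diff.

import os


def ensure_logs_dir(logs_dir: str) -> None:
    # Create the directory tree if it is missing; do nothing if it is already there.
    os.makedirs(logs_dir, exist_ok=True)


ensure_logs_dir("./logs/run_evaluation")  # example path, not taken from the commit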
