| """ |
| Code BLEU metric implementation |
| """ |
|
|
| import datasets |
| import evaluate |
| from codebleu import calc_codebleu |
|
|
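
# Equal weights for the four CodeBLEU components, combined as
#   CodeBLEU = alpha * BLEU + beta * weighted BLEU + gamma * AST match + theta * data-flow match,
# passed to calc_codebleu in that order.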
CODEBLEU_WEIGHTS = (0.25, 0.25, 0.25, 0.25)


_CITATION = """\
@misc{ren2020codebleu,
    title={CodeBLEU: a Method for Automatic Evaluation of Code Synthesis},
    author={Shuo Ren and Daya Guo and Shuai Lu and Long Zhou and Shujie Liu and Duyu Tang and Neel Sundaresan and Ming Zhou and Ambrosio Blanco and Shuai Ma},
    year={2020},
    eprint={2009.10297},
    archivePrefix={arXiv},
    primaryClass={cs.SE}
}
"""


_DESCRIPTION = """
An ideal evaluation metric should consider both grammatical correctness and logical correctness.
CodeBLEU measures grammatical correctness with weighted n-gram match and syntactic AST match, and logical correctness with semantic data-flow match.
Source: https://pypi.org/project/codebleu/
"""


_KWARGS_DESCRIPTION = """
Computes the CodeBLEU score of generated code segments against one or more references.
Args:
    predictions: list of code generations to score.
    references: list of reference code strings (one per prediction), or a list of
        lists of reference strings when each prediction has multiple references.
Returns:
    'codebleu_score': the CodeBLEU score, averaged over all prediction/reference pairs.
Examples:

    >>> predictions = ["def add ( a , b ) :\\n return a + b"]
    >>> references = ["def sum ( first , second ) :\\n return second + first"]
    >>> codebleu = evaluate.load("codebleu_score")
    >>> results = codebleu.compute(predictions=predictions, references=references)
    >>> print(round(results["codebleu_score"], 4))
    0.5537
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CodeBleu(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
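            # Two accepted input schemas: a list of reference strings per
            # prediction, or a single reference string per prediction.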
            features=[
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
                    }
                ),
                datasets.Features(
                    {
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            codebase_urls=["https://github.com/microsoft/CodeXGLUE/tree/main"],
            reference_urls=[
                "https://pypi.org/project/codebleu/",
            ],
        )

    def compute_codebleu_score(self, ground_truth, generated_answer, lang="python"):
        """
        Compute the CodeBLEU score between a ground-truth snippet and a generated snippet.
        Supports C, C#, C++, Go, Java, JavaScript, PHP, Python, Ruby, and Rust via the ``lang`` keyword.
        """
        result = calc_codebleu([ground_truth], [generated_answer], lang=lang, weights=CODEBLEU_WEIGHTS, tokenizer=None)
        return result["codebleu"]

    def _compute(self, references, predictions, lang="python"):
        # Average per-sample CodeBLEU scores; evaluate forwards extra compute()
        # kwargs such as lang to _compute.
        scores = [self.compute_codebleu_score(r, p, lang=lang) for r, p in zip(references, predictions)]
        return {"codebleu_score": sum(scores) / len(scores)}