From 93d0f18e8c5d69e981c02afca98853681129a201 Mon Sep 17 00:00:00 2001
From: pltrdy
Date: Wed, 15 Mar 2017 19:39:32 +0100
Subject: [PATCH] Introducing ROUGE: A full Python Implementation of the ROUGE Metric

We provide a fast Python module for ROUGE scoring, as well as a command to
use it directly from the shell.
---
 .gitignore           |   3 +
 README.md            | 123 +++++++++++++
 bin/__init__.py      |   0
 bin/rouge_cmd.py     |  36 ++++
 rouge/__init__.py    |   3 +
 rouge/rouge.py       | 135 ++++++++++++++
 rouge/rouge_score.py | 381 +++++++++++++++++++++++++++++++++++++++
 setup.py             |  30 +++
 tests/data.json      |  84 +++++++++
 tests/hyp.txt        |   4 +
 tests/ref.txt        |   4 +
 tests/setup.cfg      |   2 +
 tests/test_basic.py  |  37 ++++
 13 files changed, 842 insertions(+)
 create mode 100644 README.md
 create mode 100644 bin/__init__.py
 create mode 100755 bin/rouge_cmd.py
 create mode 100644 rouge/__init__.py
 create mode 100644 rouge/rouge.py
 create mode 100644 rouge/rouge_score.py
 create mode 100644 setup.py
 create mode 100644 tests/data.json
 create mode 100644 tests/hyp.txt
 create mode 100644 tests/ref.txt
 create mode 100644 tests/setup.cfg
 create mode 100644 tests/test_basic.py

diff --git a/.gitignore b/.gitignore
index 72364f9..2962cab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# swap files
+*.swp
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3737148
--- /dev/null
+++ b/README.md
@@ -0,0 +1,123 @@
+# Rouge
+*A full Python library for the ROUGE metric [(paper)](http://www.aclweb.org/anthology/W04-1013).*
+
+## Quickstart
+#### Clone & Install
+```shell
+git clone https://github.com/pltrdy/rouge
+cd rouge
+sudo python3 setup.py install
+```
+or from pip:
+```
+sudo pip3 install rouge
+```
+#### Use it from the shell (JSON Output)
+```
+$ rouge -h
+usage: rouge [-h] [-f] [-a] hypothesis reference
+
+Rouge Metric Calculator
+
+positional arguments:
+  hypothesis  Text or file path
+  reference   Text or file path
+
+optional arguments:
+  -h, --help  show this help message and exit
+  -f, --file  File mode
+  -a, --avg   Average mode
+
+```
+
+e.g.
+
+```shell
+# Single Sentence
+rouge "transcript is a written version of each day 's cnn student" \
+      "this page includes the show transcript use the transcript to help students with"
+
+# Scoring using two files (line by line)
+rouge -f ./tests/hyp.txt ./tests/ref.txt
+```
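+
+For example, the file and average modes can be combined to get one averaged
+score over a small corpus (a sketch assuming the sample files shipped under
+`./tests`):
+
+```shell
+# Average ROUGE-1/2/L over all line pairs of the two files
+rouge -f -a ./tests/hyp.txt ./tests/ref.txt
+```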
+
+#### As a library
+
+###### Score 1 sentence
+
+```python
+from rouge import Rouge
+
+hypothesis = "the #### transcript is a written version of each day 's cnn student news program use this transcript to help students with reading comprehension and vocabulary use the weekly newsquiz to test your knowledge of storie s you saw on cnn student news"
+
+reference = "this page includes the show transcript use the transcript to help students with reading comprehension and vocabulary at the bottom of the page , comment for a chance to be mentioned on cnn student news . you must be a teacher or a student age # # or older to request a mention on the cnn student news roll call . the weekly newsquiz tests students ' knowledge of even ts in the news"
+
+rouge = Rouge()
+scores = rouge.get_scores(hypothesis, reference)
+```
+
+*Output:*
+
+```json
+{
+  "rouge-1": {
+    "f": 0.5238095189484127,
+    "p": 0.6285714285714286,
+    "r": 0.4489795918367347
+  },
+  "rouge-2": {
+    "f": 0.27027026566025497,
+    "p": 0.375,
+    "r": 0.2112676056338028
+  },
+  "rouge-l": {
+    "f": 0.28711800978275975,
+    "p": 0.4418604651162791,
+    "r": 0.25675675675675674
+  }
+}
+```
+
+###### Score multiple sentences
+```python
+import json
+from rouge import Rouge
+
+# Load some sentences
+with open('./tests/data.json') as f:
+    data = json.load(f)
+
+hyps, refs = map(list, zip(*[[d['hyp'], d['ref']] for d in data]))
+rouge = Rouge()
+scores = rouge.get_scores(hyps, refs)
+# or
+scores = rouge.get_scores(hyps, refs, avg=True)
+```
+
+*Output (`avg=False`)*: a list of `n` dicts:
+
+```
+{"rouge-1": {"f": _, "p": _, "r": _}, "rouge-2": {...}, "rouge-l": {...}}
+```
+
+*Output (`avg=True`)*: a single dict with averaged values:
+
+```
+{"rouge-1": {"f": _, "p": _, "r": _}, "rouge-2": {...}, "rouge-l": {...}}
+```
+
+###### Score two files (line by line)
+Given two files `hyp_path`, `ref_path`, with the same number (`n`) of lines, calculate a score for each of these lines, or the average over the whole files.
+
+```python
+from rouge import FilesRouge
+
+files_rouge = FilesRouge(hyp_path, ref_path)
+scores = files_rouge.get_scores()
+# or
+scores = files_rouge.get_scores(avg=True)
+```
+
+**Note** that you can avoid consuming too much memory by passing `batch_lines=l`: the files are then read only `l` lines at a time (otherwise they are loaded whole).
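+
+For instance, a minimal sketch of batched scoring (assuming `hyp_path` and
+`ref_path` point to two files with the same number of lines, as above):
+
+```python
+from rouge import FilesRouge
+
+# Read and score the files 64 lines at a time instead of loading them whole.
+files_rouge = FilesRouge(hyp_path, ref_path, batch_lines=64)
+avg_scores = files_rouge.get_scores(avg=True)
+```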
diff --git a/bin/__init__.py b/bin/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/bin/rouge_cmd.py b/bin/rouge_cmd.py
new file mode 100755
index 0000000..5c45178
--- /dev/null
+++ b/bin/rouge_cmd.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import os
+from rouge import Rouge, FilesRouge
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Rouge Metric Calculator')
+    parser.add_argument('-f', '--file', help="File mode", action='store_true')
+    parser.add_argument('-a', '--avg', help="Average mode", action='store_true')
+    parser.add_argument('hypothesis', type=str, help='Text or file path')
+    parser.add_argument('reference', type=str, help='Text or file path')
+
+    args = parser.parse_args()
+    if args.file:
+        hyp, ref = args.hypothesis, args.reference
+        assert(os.path.isfile(hyp))
+        assert(os.path.isfile(ref))
+
+        files_rouge = FilesRouge(hyp, ref)
+        scores = files_rouge.get_scores(avg=args.avg)
+
+        print(json.dumps(scores, indent=2))
+    else:
+        hyp, ref = args.hypothesis, args.reference
+        assert(type(hyp) == str)
+        assert(type(ref) == str)
+
+        rouge = Rouge()
+        scores = rouge.get_scores(hyp, ref, avg=args.avg)
+
+        print(json.dumps(scores, indent=2))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/rouge/__init__.py b/rouge/__init__.py
new file mode 100644
index 0000000..75e2265
--- /dev/null
+++ b/rouge/__init__.py
@@ -0,0 +1,3 @@
+from rouge.rouge import FilesRouge, Rouge
+
+__version__ = "0.2"
diff --git a/rouge/rouge.py b/rouge/rouge.py
new file mode 100644
index 0000000..84c9f38
--- /dev/null
+++ b/rouge/rouge.py
@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+import os
+
+import numpy as np
+
+import rouge.rouge_score as rouge_score
+
+
+class FilesRouge:
+    def __init__(self, hyp_path, ref_path, metrics=None, stats=None,
+                 batch_lines=None):
+        assert(os.path.isfile(hyp_path))
+        assert(os.path.isfile(ref_path))
+
+        self.rouge = Rouge(metrics=metrics, stats=stats)
+
+        def line_count(path):
+            count = 0
+            for line in open(path):
+                count += 1
+            return count
+
+        hyp_lc = line_count(hyp_path)
+        ref_lc = line_count(ref_path)
+        assert(hyp_lc == ref_lc)
+
+        assert(batch_lines is None or type(batch_lines) == int)
+
+        self.hyp_path = hyp_path
+        self.ref_path = ref_path
+        self.hyp_lc = hyp_lc
+        self.batch_lines = batch_lines
+
+    def get_scores(self, avg=False):
+        """Calculate ROUGE scores between each pair of
+        lines (hyp_file[i], ref_file[i]).
+
+        Args:
+          * avg (False): return a single dict of averaged scores instead of
+            a list of per-line scores
+        """
+        hyp_path, ref_path = self.hyp_path, self.ref_path
+        batch_lines = self.batch_lines
+        hyp_lc = self.hyp_lc
+
+        if batch_lines is None:
+            hyps = [line[:-1] for line in open(hyp_path).readlines()]
+            refs = [line[:-1] for line in open(ref_path).readlines()]
+
+            return self.rouge.get_scores(hyps, refs, avg=avg)
+
+        if batch_lines > hyp_lc:
+            batch_lines = hyp_lc
+
+        hyp_file = open(hyp_path)
+        ref_file = open(ref_path)
+
+        scores = []
+        batch_hyp = []
+        batch_ref = []
+
+        for _ in range(hyp_lc):
+            batch_hyp.append(hyp_file.readline()[:-1])
+            batch_ref.append(ref_file.readline()[:-1])
+
+            if len(batch_hyp) == batch_lines:
+                scores += self.rouge.get_scores(batch_hyp, batch_ref)
+                batch_hyp = []
+                batch_ref = []
+
+        # score the last (possibly smaller) batch
+        if len(batch_hyp) > 0:
+            scores += self.rouge.get_scores(batch_hyp, batch_ref)
+
+        if avg:
+            return {m: {s: np.mean([sc[m][s] for sc in scores])
+                        for s in self.rouge.stats}
+                    for m in self.rouge.metrics}
+        return scores
+
+
+class Rouge:
+    DEFAULT_METRICS = ["rouge-1", "rouge-2", "rouge-l"]
+    AVAILABLE_METRICS = {
+        "rouge-1": lambda hyp, ref: rouge_score.rouge_n([hyp], [ref], 1),
+        "rouge-2": lambda hyp, ref: rouge_score.rouge_n([hyp], [ref], 2),
+        "rouge-l": lambda hyp, ref: rouge_score.rouge_l_sentence_level([hyp], [ref]),
+    }
+
+    DEFAULT_STATS = ["f", "p", "r"]
+    AVAILABLE_STATS = {"f": 0, "p": 1, "r": 2}
+
+    def __init__(self, metrics=None, stats=None):
+        self.metrics = metrics if metrics is not None else Rouge.DEFAULT_METRICS
+        self.stats = stats if stats is not None else Rouge.DEFAULT_STATS
+
+        for m in self.metrics:
+            if m not in Rouge.AVAILABLE_METRICS:
+                raise ValueError("Unknown metric '%s'" % m)
+
+        for s in self.stats:
+            if s not in Rouge.AVAILABLE_STATS:
+                raise ValueError("Unknown stat '%s'" % s)
+
+    def get_scores(self, hyps, refs, avg=False):
+        if type(hyps) == str:
+            hyps, refs = [hyps], [refs]
+
+        assert(type(hyps) == type(refs))
+        assert(len(hyps) == len(refs))
+
+        if not avg:
+            return self._get_scores(hyps, refs)
+        return self._get_avg_scores(hyps, refs)
+
+    def _get_scores(self, hyps, refs):
+        scores = []
+        for hyp, ref in zip(hyps, refs):
+            sen_score = {}
+            for m in self.metrics:
+                fn = Rouge.AVAILABLE_METRICS[m]
+                sc = fn(hyp, ref)
+                sen_score[m] = {s: sc[Rouge.AVAILABLE_STATS[s]]
+                                for s in self.stats}
+            scores.append(sen_score)
+        return scores
+
+    def _get_avg_scores(self, hyps, refs):
+        scores = {}
+        for m in self.metrics:
+            fn = Rouge.AVAILABLE_METRICS[m]
+            sc = [fn(hyp, ref) for hyp, ref in zip(hyps, refs)]
+            sc = [[sen_sc[Rouge.AVAILABLE_STATS[s]] for s in self.stats]
+                  for sen_sc in sc]
+            scores[m] = {s: st for s, st
+                         in zip(self.stats, tuple(map(np.mean, zip(*sc))))}
+        return scores
diff --git a/rouge/rouge_score.py b/rouge/rouge_score.py
new file mode 100644
index 0000000..8c8715b
--- /dev/null
+++ b/rouge/rouge_score.py
@@ -0,0 +1,381 @@
+# -*- coding: utf-8 -*-
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ROUGE metric implementation.
+
+This is a very slightly modified version of
+https://github.com/pltrdy/seq2seq/blob/master/seq2seq/metrics/rouge.py
+which is itself a modified and slightly extended version of
+https://github.com/miso-belica/sumy/blob/dev/sumy/evaluation/rouge.py.
+"""
+from __future__ import absolute_import
+from __future__ import division, print_function, unicode_literals
+
+import itertools
+import numpy as np
+
+#pylint: disable=C0103
+
+
+def _get_ngrams(n, text):
+  """Calculates n-grams.
+
+  Args:
+    n: which n-grams to calculate
+    text: An array of tokens
+
+  Returns:
+    A set of n-grams
+  """
+  ngram_set = set()
+  text_length = len(text)
+  max_index_ngram_start = text_length - n
+  for i in range(max_index_ngram_start + 1):
+    ngram_set.add(tuple(text[i:i + n]))
+  return ngram_set
+
+
+def _split_into_words(sentences):
+  """Splits multiple sentences into words and flattens the result."""
+  return list(itertools.chain(*[_.split(" ") for _ in sentences]))
+
+
+def _get_word_ngrams(n, sentences):
+  """Calculates word n-grams for multiple sentences."""
+  assert len(sentences) > 0
+  assert n > 0
+
+  words = _split_into_words(sentences)
+  return _get_ngrams(n, words)
+
+
+def _len_lcs(x, y):
+  """Returns the length of the Longest Common Subsequence between
+  sequences x and y.
+  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+
+  Args:
+    x: sequence of words
+    y: sequence of words
+
+  Returns:
+    integer: length of LCS between x and y
+  """
+  table = _lcs(x, y)
+  n, m = len(x), len(y)
+  return table[n, m]
+
+
+def _lcs(x, y):
+  """Computes the length of the longest common subsequence (lcs) between two
+  sequences. The implementation below uses a dynamic programming algorithm and
+  runs in O(nm) time where n = len(x) and m = len(y).
+  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+
+  Args:
+    x: collection of words
+    y: collection of words
+
+  Returns:
+    A dictionary mapping each coordinate (i, j) to the LCS length of
+    x[:i] and y[:j]
+  """
+  n, m = len(x), len(y)
+  table = dict()
+  for i in range(n + 1):
+    for j in range(m + 1):
+      if i == 0 or j == 0:
+        table[i, j] = 0
+      elif x[i - 1] == y[j - 1]:
+        table[i, j] = table[i - 1, j - 1] + 1
+      else:
+        table[i, j] = max(table[i - 1, j], table[i, j - 1])
+  return table
+
+
+def _recon_lcs(x, y):
+  """Returns the Longest Common Subsequence between x and y.
+  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+
+  Args:
+    x: sequence of words
+    y: sequence of words
+
+  Returns:
+    sequence: LCS of x and y
+  """
+  i, j = len(x), len(y)
+  table = _lcs(x, y)
+
+  def _recon(i, j):
+    """private recon calculation"""
+    if i == 0 or j == 0:
+      return []
+    elif x[i - 1] == y[j - 1]:
+      return _recon(i - 1, j - 1) + [(x[i - 1], i)]
+    elif table[i - 1, j] > table[i, j - 1]:
+      return _recon(i - 1, j)
+    else:
+      return _recon(i, j - 1)
+
+  recon_tuple = tuple(map(lambda x: x[0], _recon(i, j)))
+  return recon_tuple
+
+
+def rouge_n(evaluated_sentences, reference_sentences, n=2):
+  """Computes ROUGE-N of two text collections of sentences.
+  Source: http://research.microsoft.com/en-us/um/people/cyl/download/
+  papers/rouge-working-note-v1.3.1.pdf
+
+  Args:
+    evaluated_sentences: The sentences that have been picked by the summarizer
+    reference_sentences: The sentences from the reference set
+    n: Size of ngram. Defaults to 2.
+
+  Returns:
+    A tuple (f1, precision, recall) for ROUGE-N
+
+  Raises:
+    ValueError: raises exception if a param has len <= 0
+  """
+  if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
+    raise ValueError("Collections must contain at least 1 sentence.")
+
+  evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
+  reference_ngrams = _get_word_ngrams(n, reference_sentences)
+  reference_count = len(reference_ngrams)
+  evaluated_count = len(evaluated_ngrams)
+
+  # Gets the overlapping ngrams between evaluated and reference
+  overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
+  overlapping_count = len(overlapping_ngrams)
+
+  # Handle edge case. This isn't mathematically correct, but it's good enough
+  if evaluated_count == 0:
+    precision = 0.0
+  else:
+    precision = overlapping_count / evaluated_count
+
+  if reference_count == 0:
+    recall = 0.0
+  else:
+    recall = overlapping_count / reference_count
+
+  f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))
+
+  # return overlapping_count / reference_count
+  return f1_score, precision, recall
+
+
+def _f_p_r_lcs(llcs, m, n):
+  """Computes the LCS-based F-measure score.
+  Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/
+  rouge-working-note-v1.3.1.pdf
+
+  Args:
+    llcs: Length of LCS
+    m: number of words in reference summary
+    n: number of words in candidate summary
+
+  Returns:
+    A tuple (f_lcs, p_lcs, r_lcs): the LCS-based F-measure, precision and recall
+  """
+  r_lcs = llcs / m
+  p_lcs = llcs / n
+  beta = p_lcs / (r_lcs + 1e-12)
+  num = (1 + (beta**2)) * r_lcs * p_lcs
+  denom = r_lcs + ((beta**2) * p_lcs)
+  f_lcs = num / (denom + 1e-12)
+  return f_lcs, p_lcs, r_lcs
+
+
+def rouge_l_sentence_level(evaluated_sentences, reference_sentences):
+  """Computes ROUGE-L (sentence level) of two text collections of sentences.
+  http://research.microsoft.com/en-us/um/people/cyl/download/papers/
+  rouge-working-note-v1.3.1.pdf
+
+  Calculated according to:
+    R_lcs = LCS(X, Y) / m
+    P_lcs = LCS(X, Y) / n
+    F_lcs = ((1 + beta^2) * R_lcs * P_lcs) / (R_lcs + (beta^2) * P_lcs)
+
+  where:
+    X = reference summary
+    Y = candidate summary
+    m = length of reference summary
+    n = length of candidate summary
+
+  Args:
+    evaluated_sentences: The sentences that have been picked by the summarizer
+    reference_sentences: The sentences from the reference set
+
+  Returns:
+    A tuple (f_lcs, p_lcs, r_lcs) for ROUGE-L
+
+  Raises:
+    ValueError: raises exception if a param has len <= 0
+  """
+  if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
+    raise ValueError("Collections must contain at least 1 sentence.")
+  reference_words = _split_into_words(reference_sentences)
+  evaluated_words = _split_into_words(evaluated_sentences)
+  m = len(reference_words)
+  n = len(evaluated_words)
+  lcs = _len_lcs(evaluated_words, reference_words)
+  return _f_p_r_lcs(lcs, m, n)
+
+
+def _union_lcs(evaluated_sentences, reference_sentence):
+  """Returns LCS_u(r_i, C) which is the LCS score of the union longest common
+  subsequence between reference sentence r_i and candidate summary C. For
+  example, if r_i = w1 w2 w3 w4 w5 and C contains two sentences,
+  c1 = w1 w2 w6 w7 w8 and c2 = w1 w3 w8 w9 w5, then the longest common
+  subsequence of r_i and c1 is "w1 w2" and the longest common subsequence of
+  r_i and c2 is "w1 w3 w5". The union longest common subsequence of r_i, c1,
+  and c2 is "w1 w2 w3 w5" and LCS_u(r_i, C) = 4/5.
+
+  Args:
+    evaluated_sentences: The sentences that have been picked by the summarizer
+    reference_sentence: One of the sentences in the reference summaries
+
+  Returns:
+    float: LCS_u(r_i, C)
+
+  Raises:
+    ValueError: raises exception if a param has len <= 0
+  """
+  if len(evaluated_sentences) <= 0:
+    raise ValueError("Collections must contain at least 1 sentence.")
+
+  lcs_union = set()
+  reference_words = _split_into_words([reference_sentence])
+  combined_lcs_length = 0
+  for eval_s in evaluated_sentences:
+    evaluated_words = _split_into_words([eval_s])
+    lcs = set(_recon_lcs(reference_words, evaluated_words))
+    combined_lcs_length += len(lcs)
+    lcs_union = lcs_union.union(lcs)
+
+  union_lcs_count = len(lcs_union)
+  union_lcs_value = union_lcs_count / combined_lcs_length
+  return union_lcs_value
+
+
+def rouge_l_summary_level(evaluated_sentences, reference_sentences):
+  """Computes ROUGE-L (summary level) of two text collections of sentences.
+  http://research.microsoft.com/en-us/um/people/cyl/download/papers/
+  rouge-working-note-v1.3.1.pdf
+
+  Calculated according to:
+    R_lcs = SUM(1, u)[LCS(r_i, C)] / m
+    P_lcs = SUM(1, u)[LCS(r_i, C)] / n
+    F_lcs = ((1 + beta^2) * R_lcs * P_lcs) / (R_lcs + (beta^2) * P_lcs)
+
+  where:
+    SUM(i, u) = sum from i through u
+    u = number of sentences in reference summary
+    C = candidate summary made up of v sentences
+    m = number of words in reference summary
+    n = number of words in candidate summary
+
+  Args:
+    evaluated_sentences: The sentences that have been picked by the summarizer
+    reference_sentences: The sentences from the reference set
+
+  Returns:
+    A tuple (f_lcs, p_lcs, r_lcs) for ROUGE-L
+
+  Raises:
+    ValueError: raises exception if a param has len <= 0
+  """
+  if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
+    raise ValueError("Collections must contain at least 1 sentence.")
+
+  # total number of words in reference sentences
+  m = len(_split_into_words(reference_sentences))
+
+  # total number of words in evaluated sentences
+  n = len(_split_into_words(evaluated_sentences))
+
+  union_lcs_sum_across_all_references = 0
+  for ref_s in reference_sentences:
+    union_lcs_sum_across_all_references += _union_lcs(evaluated_sentences,
+                                                      ref_s)
+  return _f_p_r_lcs(union_lcs_sum_across_all_references, m, n)
+
+
+def rouge(hypotheses, references):
+  """Calculates rouge scores for a list of hypotheses and references.
+
+  Args:
+    * hypotheses: a list of n sentences
+    * references: a list of n sentences
+
+  Returns:
+    * rouge-1, rouge-2, rouge-l: lists of n (f-measure, precision, recall) tuples
+  """
+  # Filter out hyps that are of 0 length
+  hyps_and_refs = zip(hypotheses, references)
+  hyps_and_refs = [_ for _ in hyps_and_refs if len(_[0]) > 0]
+  hypotheses, references = zip(*hyps_and_refs)
+
+  # Calculate ROUGE-1 F1, precision, recall scores
+  rouge_1 = [
+      rouge_n([hyp], [ref], 1) for hyp, ref in zip(hypotheses, references)
+  ]
+
+  # Calculate ROUGE-2 F1, precision, recall scores
+  rouge_2 = [
+      rouge_n([hyp], [ref], 2) for hyp, ref in zip(hypotheses, references)
+  ]
+
+  # Calculate ROUGE-L F1, precision, recall scores
+  rouge_l = [
+      rouge_l_sentence_level([hyp], [ref])
+      for hyp, ref in zip(hypotheses, references)
+  ]
+
+  return rouge_1, rouge_2, rouge_l
+
+
+def avg_rouge(hypotheses, references):
+  """Calculates average rouge scores for a list of hypotheses and references."""
+  rouge_1, rouge_2, rouge_l = rouge(hypotheses, references)
+
+  avg_rouge_1 = tuple(map(np.mean, zip(*rouge_1)))
+  avg_rouge_2 = tuple(map(np.mean, zip(*rouge_2)))
+  avg_rouge_l = tuple(map(np.mean, zip(*rouge_l)))
+
+  return avg_rouge_1, avg_rouge_2, avg_rouge_l
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..6af7c79
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,30 @@
+from setuptools import setup, find_packages
+
+import rouge
+
+version = rouge.__version__
+setup(
+    name="rouge",
+    version=version,
+    description="Full Python ROUGE Score Implementation (not a wrapper)",
+    url="http://github.com/pltrdy/rouge",
+    download_url="https://github.com/pltrdy/rouge/archive/%s.tar.gz" % version,
+    author="pltrdy",
+    author_email="pltrdy@gmail.com",
+    keywords=["NL", "CL", "natural language processing",
+              "computational linguistics", "summarization"],
+    packages=find_packages(),
+    classifiers=[
+        "Intended Audience :: Science/Research",
+        "Programming Language :: Python :: 3",
+        "Topic :: Text Processing :: Linguistic"
+    ],
+    license="LICENCE.txt",
+    long_description=open("README.md").read(),
+    test_suite="nose.collector",
+    tests_require=['nose'],
+    entry_points={
+        'console_scripts': [
+            'rouge=bin.rouge_cmd:main'
+        ]
+    }
+)
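+
+# Note: with the test_suite/tests_require settings above, the bundled tests
+# should be runnable from the repository root with either of (untested sketch):
+#
+#     python setup.py test
+#     nosetests tests/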
diff --git a/tests/data.json b/tests/data.json
new file mode 100644
--- /dev/null
+++ b/tests/data.json
@@ -0,0 +1,84 @@
+  "hyp": "the captain of the delta flight was en route to airport , the coast guard says the plane was carrying ### passengers and ## crew members the plane was en route from atlanta to the dominican republic",
+  "ref": "delta air lines flight #### skidded into a fence last week at a laguardia airport beset by winter weather the ntsb says the crew reported they did not sense any deceleration from the wheel brake upon landing",
+  "scores": {
+    "rouge-1": {
+      "f": 0.2295081917871541,
+      "p": 0.25925925925925924,
+      "r": 0.20588235294117646
+    },
+    "rouge-2": {
+      "f": 0.028985502255829465,
+      "p": 0.030303030303030304,
+      "r": 0.027777777777777776
+    },
+    "rouge-l": {
+      "f": 0.23991471215299215,
+      "p": 0.23684210526315788,
+      "r": 0.24324324324324326
+    }
+  }
+}]
diff --git a/tests/hyp.txt b/tests/hyp.txt
new file mode 100644
index 0000000..cf78dfd
--- /dev/null
+++ b/tests/hyp.txt
@@ -0,0 +1,4 @@
+the #### transcript is a written version of each day 's cnn student news program use this transcript to help students with reading comprehension and vocabulary use the weekly newsquiz to test your knowledge of storie s you saw on cnn student news
+a u.s. citizen was killed in a a shootout in mississippi in #### he was shot in the head and died in a bath tub in omaha , louisiana authorities are investigating the death",
+nelson mandela is a women 's advocate for women , nelson mandela says nelson mandela is a women 's advocate for women she says women do n't know how women are women
+the captain of the delta flight was en route to airport , the coast guard says the plane was carrying ### passengers and ## crew members the plane was en route from atlanta to the dominican republic
diff --git a/tests/ref.txt b/tests/ref.txt
new file mode 100644
index 0000000..aef0f9a
--- /dev/null
+++ b/tests/ref.txt
@@ -0,0 +1,4 @@
+this page includes the show transcript use the transcript to help students with reading comprehension and vocabulary at the bottom of the page , comment for a chance to be mentioned on cnn student news . you must be a teacher or a student age # # or older to request a mention on the cnn student news roll call . the weekly newsquiz tests students ' knowledge of even ts in the news
+the fugitive who killed the marshal was " extremely dangerous , " u.s. marshals service director says deputy u.s. marshal josie wells , ## , died after trying to arrest jamie croom " before he 'd go back to jail , he said , he 'd rather be dead, " croom 's sister says
+cnn 's kelly wallace wonders why women too often do n't lift each up in the workplace author of " the woman code " says women need to start operating like the boys women need to realize they win when they help other women get ahead , says author
+delta air lines flight #### skidded into a fence last week at a laguardia airport beset by winter weather the ntsb says the crew reported they did not sense any deceleration from the wheel brake upon landing
diff --git a/tests/setup.cfg b/tests/setup.cfg
new file mode 100644
index 0000000..b88034e
--- /dev/null
+++ b/tests/setup.cfg
@@ -0,0 +1,2 @@
+[metadata]
+description-file = README.md
diff --git a/tests/test_basic.py b/tests/test_basic.py
new file mode 100644
index 0000000..b601622
--- /dev/null
+++ b/tests/test_basic.py
@@ -0,0 +1,37 @@
+from unittest import TestCase
+
+import rouge
+import json
+
+class BasicTest(TestCase):
+    def setUp(self):
+        self.hyp_path = './tests/hyp.txt'
+        self.ref_path = './tests/ref.txt'
+
+        self.data_path = './tests/data.json'
+        with open(self.data_path) as f:
+            self.data = json.load(f)
+
+        self.rouge = rouge.Rouge()
+        self.files_rouge = rouge.FilesRouge(self.hyp_path, self.ref_path)
+
+    def test_one_sentence(self):
+        for d in self.data[:1]:
+            hyp = d["hyp"]
+            ref = d["ref"]
+            score = self.rouge.get_scores(hyp, ref)[0]
+            self.assertEqual(score, d["scores"])
+
+    def test_multi_sentence(self):
+        data = self.data
+        hyps, refs = map(list, zip(*[[d['hyp'], d['ref']] for d in data]))
+        expected_scores = [d['scores'] for d in data]
+        scores = self.rouge.get_scores(hyps, refs)
+        self.assertEqual(expected_scores, scores)
+
+    def test_files_scores(self):
+        data = self.data
+        hyps, refs = map(list, zip(*[[d['hyp'], d['ref']] for d in data]))
+        expected_scores = [d['scores'] for d in data]
+        scores = self.files_rouge.get_scores()
+        self.assertEqual(expected_scores, scores)
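+
+    def test_rouge_n_direct(self):
+        # Illustrative sketch of the lower-level API wrapped by Rouge: the
+        # rouge_score helpers return plain (f, p, r) tuples. The toy sentences
+        # below are made up for this example, not taken from data.json.
+        from rouge import rouge_score
+        hyp = "the cat sat on the mat"
+        ref = "the cat was on the mat"
+        f, p, r = rouge_score.rouge_n([hyp], [ref], 1)
+        self.assertAlmostEqual(p, 4 / 5)
+        self.assertAlmostEqual(r, 4 / 5)
+        self.assertTrue(0.0 <= f <= 1.0)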