This repository has been archived by the owner on Jul 4, 2023. It is now read-only.

Added Rogue Metric #113

Open · wants to merge 8 commits into master
19 changes: 2 additions & 17 deletions README.md
100755 → 100644
@@ -1,3 +1,5 @@
# PyTorch-NLP
Basic Utilities for PyTorch Natural Language Processing (NLP)
<p align="center"><img width="55%" src="docs/_static/img/logo.svg" /></p>

<h3 align="center">Basic Utilities for PyTorch Natural Language Processing (NLP)</h3>
@@ -45,7 +47,6 @@ Load the IMDB dataset, for example:

```python
from torchnlp.datasets import imdb_dataset

# Load the imdb training dataset
train = imdb_dataset(train=True)
train[0] # RETURNS: {'text': 'For a movie that gets..', 'sentiment': 'pos'}
@@ -55,17 +56,13 @@ Load a custom dataset, for example:

```python
from pathlib import Path

from torchnlp.download import download_file_maybe_extract

directory_path = Path('data/')
train_file_path = Path('trees/train.txt')

download_file_maybe_extract(
    url='http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip',
    directory=directory_path,
    check_files=[train_file_path])

open(directory_path / train_file_path)
```

@@ -80,7 +77,6 @@ text into tokens whenever it encounters a whitespace character.

```python
from torchnlp.encoders.text import WhitespaceEncoder

loaded_data = ["now this ain't funny", "so don't you dare laugh"]
encoder = WhitespaceEncoder(loaded_data)
encoded_data = [encoder.encode(example) for example in loaded_data]
@@ -95,13 +91,10 @@ import torch
from torchnlp.samplers import BucketBatchSampler
from torchnlp.utils import collate_tensors
from torchnlp.encoders.text import stack_and_pad_tensors

encoded_data = [torch.randn(2), torch.randn(3), torch.randn(4), torch.randn(5)]

train_sampler = torch.utils.data.sampler.SequentialSampler(encoded_data)
train_batch_sampler = BucketBatchSampler(
    train_sampler, batch_size=2, drop_last=False, sort_key=lambda i: encoded_data[i].shape[0])

batches = [[encoded_data[i] for i in batch] for batch in train_batch_sampler]
batches = [collate_tensors(batch, stack_tensors=stack_and_pad_tensors) for batch in batches]
```
@@ -128,9 +121,7 @@ Wrap any code that's random, with `fork_rng` and you'll be good to go, like so:
import random
import numpy
import torch

from torchnlp.random import fork_rng

with fork_rng(seed=123): # Ensure determinism
    print('Random:', random.randint(1, 2**31))
    print('Numpy:', numpy.random.randint(1, 2**31))
@@ -154,9 +145,7 @@ pre-trained word vectors to set your embeddings, like so:
import torch
from torchnlp.encoders.text import WhitespaceEncoder
from torchnlp.word_to_vector import GloVe

encoder = WhitespaceEncoder(["now this ain't funny", "so don't you dare laugh"])

vocab_set = set(encoder.vocab)
pretrained_embedding = GloVe(name='6B', dim=100, is_include=lambda w: w in vocab_set)
embedding_weights = torch.Tensor(encoder.vocab_size, pretrained_embedding.dim)
@@ -171,10 +160,8 @@ For example, from the neural network package, apply the state-of-the-art `LockedDropout`:
```python
import torch
from torchnlp.nn import LockedDropout

input_ = torch.randn(6, 3, 10)
dropout = LockedDropout(0.5)

# Apply a LockedDropout to `input_`
dropout(input_) # RETURNS: torch.FloatTensor (6x3x10)
```
@@ -185,10 +172,8 @@ Compute common NLP metrics such as the BLEU score.

```python
from torchnlp.metrics import get_moses_multi_bleu

hypotheses = ["The brown fox jumps over the dog 笑"]
references = ["The quick brown fox jumps over the lazy dog 笑"]

# Compute BLEU score with the official BLEU perl script
get_moses_multi_bleu(hypotheses, references, lowercase=True) # RETURNS: 47.9
```
112 changes: 112 additions & 0 deletions torchnlp/metrics/rouge.py
@@ -0,0 +1,112 @@
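"""ROUGE metric (ROUGE-N and ROUGE-L) for scoring a candidate text against a reference text."""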


def _get_ngrams(n, text):
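    """Returns the set of n-grams (as tuples of length n) found in text, a sequence of tokens."""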

    ngram_set = set()
    text_length = len(text)
    max_index_ngram_start = text_length - n
    for i in range(max_index_ngram_start + 1):
        ngram_set.add(tuple(text[i:i + n]))
    return ngram_set


def _get_word_ngrams(n, sentences):
"""Calculates word n-grams for multiple sentences.
"""
assert len(sentences) > 0
assert n > 0

words = split_into_words(sentences)
return _get_ngrams(n, words)


def rouge_n(evaluated_sentences, reference_sentences, n=2):
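    """Computes the ROUGE-N F1 score of evaluated_sentences against reference_sentences.

    Both arguments are whitespace-delimited strings; n is the n-gram order (bigrams by default).
    """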

    if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0:
        raise ValueError("Collections must contain at least 1 sentence.")

    evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences)
    reference_ngrams = _get_word_ngrams(n, reference_sentences)
    reference_count = len(reference_ngrams)
    evaluated_count = len(evaluated_ngrams)

    # Gets the overlapping ngrams between evaluated and reference
    overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)
    overlapping_count = len(overlapping_ngrams)

    # Handle edge case. This isn't mathematically correct, but it's good enough
    if evaluated_count == 0:
        precision = 0.0
    else:
        precision = overlapping_count / evaluated_count

    if reference_count == 0:
        recall = 0.0
    else:
        recall = overlapping_count / reference_count

    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))

    # return overlapping_count / reference_count
    return f1_score


def len_lcs(x, y):
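    """Returns the length of the longest common subsequence between sequences x and y."""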

    n, m = len(x), len(y)
    table = dict()
    for i in range(n + 1):
        for j in range(m + 1):
            if i == 0 or j == 0:
                table[i, j] = 0
            elif x[i - 1] == y[j - 1]:
                table[i, j] = table[i - 1, j - 1] + 1
            else:
                table[i, j] = max(table[i - 1, j], table[i, j - 1])
    return table[n, m]


def split_into_words(sentences):
"""Splits multiple sentences into words and flattens the result"""
return list(sentences.split(" "))


def rouge_l(candidate, references):
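    """Computes the ROUGE-L F-measure of a candidate token list against a references token list.

    Uses precision P = LCS / len(candidate), recall R = LCS / len(references), and
    F = ((1 + beta**2) * P * R) / (R + beta**2 * P), where beta = P / R.
    """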
    lcs = len_lcs(candidate, references)
    len_x = len(candidate)
    len_y = len(references)

    recall = lcs / len_y
    precision = lcs / len_x
    beta = precision / (recall + 1e-12)
    numerator = (1 + (beta ** 2)) * (precision * recall)
    denominator = (precision * (beta ** 2) + recall) + 1e-8
    f1_score = numerator / denominator
    return f1_score


def average_rouge(candidate, references):
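    """Prints ROUGE-1, ROUGE-2 and ROUGE-L for a candidate/reference string pair and returns their mean."""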
    rouge_1 = rouge_n(candidate, references, 1)
    rouge_2 = rouge_n(candidate, references, 2)
    rouge_lcs = rouge_l(split_into_words(candidate),
                        split_into_words(references))
    avg_rouge = (rouge_1 + rouge_2 + rouge_lcs) / 3
    print("rouge_1:", rouge_1)
    print("rouge_2:", rouge_2)
    print("rouge_lcs:", rouge_lcs)
    print("average:", avg_rouge)
    return avg_rouge


def main():
x = "The quick brown fox jumped over the wall"
y = "The fast black dog and fox jumped into the wall"
x_words = split_into_words(x)
y_words = split_into_words(y)
print(x_words)
lcs = len_lcs(x_words, y_words)
print(lcs)
average_rouge(x, y)


if __name__ == "__main__":
    main()
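
To try the new metric from this branch, here is a minimal usage sketch. It assumes the file is importable as `torchnlp.metrics.rouge` (the functions may not be re-exported from `torchnlp.metrics`) and reuses the example strings from `main()` above:

```python
from torchnlp.metrics.rouge import rouge_n, average_rouge

hypothesis = "The quick brown fox jumped over the wall"
reference = "The fast black dog and fox jumped into the wall"

# ROUGE-2 F1 between two whitespace-delimited strings
print(rouge_n(hypothesis, reference, n=2))

# Prints ROUGE-1, ROUGE-2, ROUGE-L, and their average
average_rouge(hypothesis, reference)
```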