-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIMDB_utils.py
79 lines (68 loc) · 3.03 KB
/
IMDB_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 2 12:44:30 2020
@author: nifaullah
"""
import os
import numpy as np
import pandas as pd
import glob
import errno
# Path where you want to store your dataset or where the dataset is already
# stored. LoadImdbDatasets looks for train.xlsx and test.xlsx under this
# prefix. The value is prepended verbatim to the file names, so a non-empty
# path must end with a path separator; the empty string means the current
# working directory.
_path = ""
# =============================================================================
# Function to create two dataframes, one each for the train & test data, given
# that the text files are in the same structure as downloaded (i.e.
# ".../aclImdb/"). This method uses another local method, LoadOneDataset, which
# creates one dataset at a time based on the path.
# The function can be called without arguments and returns a dictionary as
# output, with each dataframe mapped to a string.
# dict["train"] will give you the training set & likewise for test.
# =============================================================================
def LoadDatasets(data_dir=".../aclImdb/"):
    """Build the train and test dataframes from the raw aclImdb text files.

    For each split ("train", "test") the "pos" and "neg" reviews are loaded
    with LoadOneDataset and stacked in that order.

    Args:
        data_dir: Root directory of the extracted dataset, laid out as
            ``<data_dir>/<split>/<pos|neg>/*.txt``. Defaults to the
            placeholder ".../aclImdb/" so existing zero-argument calls keep
            working; pass the real location instead of editing the source.

    Returns:
        dict mapping "train" and "test" to the dataframe for that split.
    """
    splits = ["train", "test"]
    sentiments = ["pos", "neg"]
    all_text = "*.txt"

    dataset_dict = {}
    for split in splits:
        # Collect the per-sentiment frames and concatenate once, rather than
        # growing a frame by repeatedly concatenating onto an empty DataFrame.
        frames = [
            LoadOneDataset(os.path.join(data_dir, split, label, all_text), label)
            for label in sentiments
        ]
        # Row labels are kept as returned by LoadOneDataset (not renumbered).
        dataset_dict[split] = pd.concat(frames)
    return dataset_dict
# =============================================================================
# Function to read text files from the given path and create a dataframe by
# adding each text file as a row with its corresponding labeled sentiment.
# The function takes 2 inputs, the path of the files & the sentiment, and
# returns a dataframe with 2 columns: review & sentiment.
# =============================================================================
def LoadOneDataset(path, sentiment):
    """Read every file matching *path* into a labeled dataframe.

    Args:
        path: Glob pattern selecting the review text files
            (e.g. ``"some/dir/*.txt"``).
        sentiment: Label for all matched files, either "pos" or "neg".

    Returns:
        DataFrame with one row per matched file and two columns:
        "review" (the full file text) and "sentiment" (1 for "pos",
        0 for "neg"). The frame is empty if nothing matches.

    Raises:
        KeyError: If *sentiment* is not "pos" or "neg".
    """
    sentiment_dict = {"neg": 0, "pos": 1}
    # Resolve the label first so a bad sentiment fails before any file I/O.
    label = sentiment_dict[sentiment]

    reviews = []
    # glob returns matches in arbitrary order; sort for a reproducible frame.
    for name in sorted(glob.glob(path)):
        with open(name, "r", encoding="utf8") as file:
            # Read the whole file as one string so multi-line (or empty)
            # reviews still occupy exactly one row.
            reviews.append(file.read())

    df = pd.DataFrame({"review": reviews})
    df["sentiment"] = label
    return df
# =============================================================================
# Wrapper method to load the IMDB dataset. It first checks whether the files
# are already present locally; if not, it builds the dataframes & saves them
# locally; if yes, it loads the dataframes from the local Excel files.
# =============================================================================
def LoadImdbDatasets():
    """Return the IMDB train and test dataframes, using a local Excel cache.

    If both ``train.xlsx`` and ``test.xlsx`` exist under the module-level
    ``_path`` prefix they are read back. Otherwise both dataframes are
    rebuilt with LoadDatasets and written to those files (without the index
    column) before being returned.

    Returns:
        tuple ``(train_df, test_df)``.
    """
    train_file = f"{_path}train.xlsx"
    test_file = f"{_path}test.xlsx"

    # Only trust the cache when both files are present; a partial cache is
    # rebuilt from scratch.
    if os.path.isfile(train_file) and os.path.isfile(test_file):
        train_df = pd.read_excel(train_file)
        test_df = pd.read_excel(test_file)
        return train_df, test_df

    datasets = LoadDatasets()
    train_df = datasets["train"]
    test_df = datasets["test"]
    # Save under the same `_path` prefix that the cache check above uses.
    train_df.to_excel(train_file, index=False)
    test_df.to_excel(test_file, index=False)
    return train_df, test_df