Skip to content

Commit

Permalink
Merge branch 'test_parsing'
Browse files Browse the repository at this point in the history
# Conflicts:
#	server/server/api/reports.py
  • Loading branch information
InbarShirizly committed Oct 6, 2020
2 parents 485b53e + 1e90030 commit 703b572
Show file tree
Hide file tree
Showing 26 changed files with 434 additions and 104 deletions.
2 changes: 1 addition & 1 deletion server/server/api/reports.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from server.api import api, custom_types
from flask_restful import Resource, reqparse, abort, marshal
from server.parsing.attendance_check import Attendance
from server.parsing.attendance import Attendance
from server import db, auth
from datetime import datetime
import pandas as pd
Expand Down
12 changes: 6 additions & 6 deletions server/server/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ class FlaskConfig:

class ParseConfig:
FILE_COLS_DICT = {
"name": ["שם התלמיד", "תלמידים", "שמות", "שם", "סטודנט"],
"id_number": ["תעודת זהות", "ת.ז.", "ת.ז", "תז"],
"phone": ["טלפון", "מספר טלפון", "מס טלפון"],
"gender": ["מין"],
"org_class": ["כיתה"]
"name": ["שם התלמיד", "תלמידים", "שמות", "שם", "סטודנט", "name", "student_name", "student"],
"id_number": ["תעודת זהות", "ת.ז.", "ת.ז", "תז", "id", "number_id", "id_number"],
"phone": ["טלפון", "מספר טלפון", "מס טלפון", "phone", "phone_number"],
"gender": ["מין", "gender"],
"org_class": ["כיתה", "org_class", "class"]
}
MASHOV_COLS = ["name", "org_class"]
MASHOV_COLS = ["name", "org_class", "id_number"]
GENDER_DICT = {1: ["זכר", "ז", "(ז)"], 0: ["נקבה", "נ", "(נ)"]}


Expand Down
4 changes: 2 additions & 2 deletions server/server/parsing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from server.config import ParseConfig
from server.parsing.loading_classroom_file import ParseClassFile
from server.parsing.parse_class_file import ParseClassFile
from collections import namedtuple


parser = ParseClassFile.from_object(ParseConfig)

AttendanceMetaData = namedtuple('meta_data', ['filter_modes', 'time_delta', 'start_sentence', 'not_included_zoom_users'])
AttendanceMetaData = namedtuple('meta_data', ['filter_modes', 'time_delta', 'start_sentence', 'zoom_names_to_ignore'])
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,18 @@ class Attendance:
1. report_sessions - session object
2. student_status_table - df of "student" table
"""
def __init__(self, chat_df, students_df, filter_modes, time_delta, start_sentence, not_included_zoom_users):
def __init__(self, chat_df, students_df, filter_modes, time_delta, start_sentence, zoom_names_to_ignore):
"""
:param chat_df: zoom chat (df)
:param students_df: student class raw data (df)
:param filter_modes: filters the user picked for parsing the text file (list of str)
:param time_delta: max time from start sentence to the last message to parse in each session in minutes (int)
:param start_sentence: start sentence that initiate sessions for parse (str)
:param not_included_zoom_users: zoom names that will not be considered (list of str)
:param zoom_names_to_ignore: zoom names that will not be considered (list of str)
:return: data frame with the data from the chat
"""
meta_data = AttendanceMetaData(filter_modes=filter_modes, time_delta=time_delta,
start_sentence=start_sentence, not_included_zoom_users=not_included_zoom_users)
start_sentence=start_sentence, zoom_names_to_ignore=zoom_names_to_ignore)

self.first_message_time = chat_df["time"].sort_values().iloc[0] # get time of first message in the chat
start_indices = Attendance.get_start_indices(chat_df, meta_data)
Expand All @@ -42,7 +42,7 @@ def get_start_indices(df, meta_data):
:param meta_data: configurations of the user
:return: list of indices of start of session
"""
not_included_zoom_users_filt = df['zoom_name'].str.contains('|'.join(meta_data.not_included_zoom_users))
not_included_zoom_users_filt = df['zoom_name'].str.contains('|'.join(meta_data.zoom_names_to_ignore))
not_included_zoom_users_df = df[not_included_zoom_users_filt]
check_sentence = lambda string: meta_data.start_sentence.lower() in string.lower()
start_indices = not_included_zoom_users_df.index[not_included_zoom_users_df['message'].apply(check_sentence)]
Expand Down Expand Up @@ -86,24 +86,3 @@ def student_status_table(self, report_id):
return df_status_report.loc[:, ["student_id", "report_id", "status"]]


if __name__ == '__main__':
from collections import namedtuple
from utils import create_chat_df, create_students_df

AttendanceMetaData = namedtuple('meta_data',
['filter_modes', 'time_delta', 'start_sentence', 'not_included_zoom_users'])

chat_file_path = r"C:\Users\Inbar Shirizly\Documents\python\useful\ITC_programs\zoom_attendance_check\chat files\meeting_example_full_name.txt"
excel_file_path = r"C:\Users\Inbar Shirizly\Documents\python\useful\ITC_programs\zoom_attendance_check\student_csv_examples\example_data_already_prepared.xlsx"


with open(chat_file_path, "r", encoding="utf-8") as f:
chat_df = create_chat_df(f.readlines())
df_students = create_students_df(file_name=excel_file_path.split("\\")[-1], file_data=excel_file_path)

my_class = Attendance(chat_df, df_students, ['name', "id_number", "phone"], 1, "Attendance check", ["ITC", "Tech", "Challenge"])
a = my_class.student_status_table(1)
print(a)
# df_part_session = my_class._sessions[0]
# df_part_session.zoom_names_table(2)

63 changes: 0 additions & 63 deletions server/server/parsing/loading_classroom_file.py

This file was deleted.

88 changes: 88 additions & 0 deletions server/server/parsing/parse_class_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import pandas as pd
import numpy as np
import re

# Substrings that mark non-student rows: ParseClassFile.mashov_file joins these
# with "|" into one regex and drops every row in which any cell contains a match.
DELETE_ROWS_CONTAIN = ["הופק בתאריך"]  # TODO: move to config

class ParseClassFile:
    """Normalise a raw student-list table into the columns named in the config.

    Two input layouts are handled: a "mashov" layout (recognised by a
    numbered-name pattern in one of the columns) and a "classic" layout whose
    column headers are looked up in ``file_cols_dict``.
    """

    def __init__(self, file_cols_dict, mashov_cols, gender_dict):
        """
        :param file_cols_dict: maps each output column name to the list of header spellings accepted for it (dict)
        :param mashov_cols: columns that must exist in a mashov file after renaming (list of str)
        :param gender_dict: maps a gender code to the list of strings that stand for it (dict)
        """
        self._file_cols_dict = file_cols_dict
        self._mashov_cols = mashov_cols
        self._gender_dict = gender_dict

    @classmethod
    def from_object(cls, config):
        """Build a parser from an object exposing FILE_COLS_DICT, MASHOV_COLS and GENDER_DICT."""
        return cls(
            config.FILE_COLS_DICT,
            config.MASHOV_COLS,
            config.GENDER_DICT
        )

    def parse_df(self, df_students):
        """
        Convert a raw students table into a frame holding exactly the keys of
        ``file_cols_dict`` as columns, in that order, with a fresh 0..n-1 index.

        :param df_students: raw students table; it is mutated in place by check_if_mashov_file (df)
        :return: normalised students table, missing columns are filled with NaN (df)
        :raises ValueError: when a mashov file lacks one of the required columns
        """
        if self.check_if_mashov_file(df_students):
            df_students = self.mashov_file(df_students)
        else:
            df_students = self.classic_file(df_students)

        expected_cols = list(self._file_cols_dict)
        for col in expected_cols:
            # columns the input did not provide are added as all-NaN
            if col not in df_students.columns:
                df_students[col] = np.nan

        # drop=True discards the old index instead of materialising it as a column
        return df_students[expected_cols].reset_index(drop=True)

    @staticmethod
    def check_if_mashov_file(df_students):
        """
        Detect the mashov layout and mark its name column.

        NOTE: this mutates ``df_students`` in place - fully empty rows and columns
        are dropped, and the first column with a matching cell is renamed to
        "name". ``parse_df`` / ``mashov_file`` rely on that rename.

        :param df_students: raw students table (df)
        :return: True if some column contains a cell matching the numbered-name pattern, else False
        """
        df_students.dropna(axis=0, how="all", inplace=True)
        df_students.dropna(axis=1, how="all", inplace=True)

        # digits + any single character (the "." is unescaped), then Hebrew
        # letters/spaces, then a run of Hebrew letters and/or parentheses
        # NOTE(review): the "." was probably meant as a literal dot - confirm before tightening
        numbered_name_pattern = r"(\d+.)([\u0590-\u05fe ]+)([(\u0590-\u05fe)]+)"
        for col in df_students.columns:
            if df_students[col].astype(str).str.match(numbered_name_pattern).any():
                df_students.rename(columns={col: "name"}, inplace=True)
                return True
        return False

    def mashov_file(self, df_students):
        """
        Parse a table in the mashov layout.

        Rows with a cell matching DELETE_ROWS_CONTAIN are removed, the Hebrew
        id/class headers are renamed, only ``mashov_cols`` are kept, and the
        "name" column is split into the name text and a gender code.

        :param df_students: students table whose name column was already renamed to "name" (df)
        :return: table with ``mashov_cols`` plus a "gender" column (df)
        :raises ValueError: when one of ``mashov_cols`` is missing from the file
        """
        delete_pattern = '|'.join(DELETE_ROWS_CONTAIN)

        # work on the transpose so that every original row can be scanned as a Series
        df_t = df_students.T
        cols_to_drop = [
            col for col in df_t.columns
            # na=False: non-string cells count as "no match" instead of yielding NaN
            if df_t[col].str.contains(delete_pattern, na=False).any()
        ]
        df_students = df_t.drop(columns=cols_to_drop).T

        df_students = df_students.rename(columns={"ת.ז.": 'id_number', "כיתה": "org_class"})
        try:
            # copy so the column assignments below never write into a view
            df_students = df_students.loc[:, self._mashov_cols].copy()
        except KeyError as err:
            raise ValueError("File content is invalid to the program configurations") from err

        mashov_name_pattern = re.compile(r"([\u0590-\u05fe ]+)([(\u0590-\u05fe)]+)")
        df_name_gender = df_students['name'].str.extract(mashov_name_pattern, expand=False)
        # second group holds the parenthesised part; pull the single character inside "(...)"
        gender_marks = df_name_gender[1].str.extract(r"\(([\u0590-\u05fe ])\)", expand=False)
        df_students['gender'] = gender_marks.apply(self.gender_assign, gender_dict=self._gender_dict)
        df_students['name'] = df_name_gender[0]
        return df_students

    def classic_file(self, df_students):
        """
        Parse a table whose headers are looked up in ``file_cols_dict``.

        :param df_students: students table with a header per column (df)
        :return: table holding only the recognised columns, renamed to their config keys (df)
        """
        current_excel_dict = {}
        for col in df_students.columns:
            # labels may be numbers/NaN, so compare on their string form
            if str(col).startswith("Unnamed"):
                continue
            for key, col_options in self._file_cols_dict.items():
                if col in col_options:
                    current_excel_dict[key] = df_students[col]
        return pd.DataFrame(current_excel_dict)

    @staticmethod
    def gender_assign(string, gender_dict):
        """
        :param string: gender text taken from the file
        :param gender_dict: maps a gender code to the list of strings that stand for it (dict)
        :return: the matching key of ``gender_dict``, or "" when nothing matches
        """
        for key, vals in gender_dict.items():
            if string in vals:
                return key
        return ""
3 changes: 1 addition & 2 deletions server/server/parsing/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,13 @@ def get_participants_in_session(df_students, df_chat, meta_data):
df_participated["index"] = df_participated["index"].astype(int)
df_participated = df_participated.loc[:, ["id", "zoom_name", "time", "message", "index"]].set_index("index")

filt = df_chat['zoom_name'].str.contains('|'.join(meta_data.not_included_zoom_users))
filt = df_chat['zoom_name'].str.contains('|'.join(meta_data.zoom_names_to_ignore))
df_relevant_chat = pd.merge(df_chat[~filt], df_participated, how="left")

df_relevant_chat["relevant"] = df_relevant_chat["id"].apply(lambda x: 1 if x == x else 0)
df_relevant_chat["id"] = df_relevant_chat["id"].apply(lambda x: int(x) if x == x else -1)
return df_relevant_chat


def zoom_names_table(self, session_id):
zoom_df = self._relevant_chat.loc[:, ["zoom_name", "id"]].rename(columns={"zoom_name": "name", "id": "student_id"})
zoom_df['session_id'] = pd.Series([session_id] * zoom_df.shape[0])
Expand Down
36 changes: 31 additions & 5 deletions server/server/parsing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,41 @@ def create_chat_df(chat_file):
chat_df = pd.DataFrame(chat_content, columns=["time", "zoom_name", "message"])
chat_df['message'] = chat_df['message'].str[:-1].astype(str)
chat_df["time"] = chat_df["time"].apply(lambda string: datetime.strptime(string, "%H:%M:%S"))

if chat_df.empty:
raise ValueError("Entered file is empty")
return chat_df


def create_students_df(file_name, file_data):
if file_name.endswith(".csv"):
df_students = pd.read_csv(file_data)
df_students = pd.read_csv(file_data, header=None)
elif file_name.endswith(".xlsx"):
df_students = pd.read_excel(file_data)
df_students = pd.read_excel(file_data, header=None)
else:
df_students = pd.read_html(file_data, header=1)[0]
return df_students
try:
df_students = pd.read_html(file_data, header=1)[0]
except ValueError:
df_students = pd.ExcelFile(file_data).parse()

clean_df = clean_student_df(df_students)

if clean_df.shape[0] > 200:
raise ValueError("Input file have to many records") #TODO: pass amount of records as config
if clean_df.empty:
raise ValueError("Entered file is empty")
return clean_df


def clean_student_df(df_students):
    """
    Strip empty/low-information parts of a raw students table and promote its
    first remaining row to be the header.

    NOTE: fully empty rows and columns are dropped from ``df_students`` in place.

    :param df_students: raw table read without a header row (df)
    :return: table whose columns are the values of the first remaining row and
             whose rows are everything below it; an empty frame when the input
             has no usable cells (df)
    """
    # first drop all rows and columns that are totally missing (for extreme cases)
    df_students.dropna(axis=0, how="all", inplace=True)
    df_students.dropna(axis=1, how="all", inplace=True)

    # nothing left: return an empty frame so the caller's "empty file" check can
    # fire, instead of failing on iloc[0] below
    if df_students.empty:
        return pd.DataFrame()

    # keep only columns with enough distinct values - at least 3
    # (the title plus 2 students), or the median count when that is higher
    unique_counts = df_students.nunique()
    min_nunique_in_cols = max(unique_counts.median(), 3)
    df_students = df_students.loc[:, unique_counts >= min_nunique_in_cols]

    # first remaining row becomes the header, the rest are the records
    return pd.DataFrame(df_students.values[1:], columns=df_students.iloc[0])


Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
I love to go to school
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
10:56:16 From
10:56:18 From
10:56:19 From
10:56:20 From
32 changes: 32 additions & 0 deletions server/test/files_to_test/chat_files/chat_file_valid.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# teacher = "Roei teacher", "start sentence": "בדיקת נוכחות, zoom_names_to_ignore = ["Roei", "Elad Visitor"]

# session 1 - should find ["מיכל קליימן", "ענבר עדי", "זאב הרצל"]. In case of >=3 minutes check & check with phone (as well) - will find "מיתר כהן"

10:46:16 From Roei teacher : בדיקת נוכחות:
10:46:18 From Idan Aviv : here
10:46:19 From Michal : מיכל קליימן
10:46:20 From Inbar Adi : ענבר עדי
10:46:22 From Zeave H : 305696031
10:48:22 From Meitar : 537642324


# session 2 - should find ["מיכל קליימן", "ענבר עדי", "אביתר כהן", "עידן אביב"]. here "Zeave H" (wrote another name and then himself - will ignore it). minutes >=2 for "מיתר כהן"
10:57:22 From Roei teacher : בדיקת נוכחות:
10:57:42 From Zeave H : אביתר כהן
10:57:43 From Idan Aviv : עידן אביב
10:57:52 From Zeave H : 305696031
10:57:53 From Inbar Adi : ענבר עדי
10:57:54 From Michal : מיכל קליימן
10:57:55 From Dana : here
10:58:46 From Ron : רון זהבי
10:58:54 From Meitar : 537642324

# session 3 - should find ["מיכל קליימן", "עידן אביב"] - checking not auth user writing the "start sentence"

11:57:16 From Roei teacher : בדיקת נוכחות
11:57:43 From Idan Aviv : עידן אביב
11:57:43 From Idan Aviv : שלום
11:57:45 From Idan Aviv : בדיקת נוכחות
11:57:46 From Elad Visitor : my not relevant message I'm visitor
11:57:54 From Michal : מיכל קליימן

10 changes: 10 additions & 0 deletions server/test/files_to_test/students_list_excel/example_csv.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
id,phone,id_number,name,org_class,gender,class_id
11,528702484,305049421,???? ????,,,
12,524291930,123424343,????? ???,,,
13,526148959,432424455,???? ??????,,,
14,523454564,423423649,???? ???,,,
15,530342423,305696031,??? ????,,,
16,530342413,305696041,???? ??????,,,
17,537642324,534234210,???? ???,,,
,,,,,,
18,537642324,534234453,???? ???,,,
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
id,phone,id_number,name,org_class,gender,class_id,
11,528702484,305049421,???? ????,,,,
12,524291930,123424343,????? ???,,,,
13,526148959,432424455,???? ??????,,,,
14,523454564,423423649,???? ???,,,,
15,530342423,305696031,??? ????,,,,
16,530342413,305696041,???? ??????,,,,
17,537642324,534234210,???? ???,,,,
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
id,phone,id_number,name,org_class,gender,class_id,11,528702484,305049421,???? ????,,,,12,524291930,123424343,????? ???,,,,13,526148959,432424455,???? ??????,,,,14,523454564,423423649,???? ???,,,,15,530342423,305696031,??? ????,,,,16,530342413,305696041,???? ??????,,,,17,537642324,534234210,???? ???,,,,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
id,phone,id_number,name,org_class,gender,class_id,
11,528702484,305049421,???? ????,,,,
12,524291930,123424343,????? ???,,,,
13,526148959,432424455,???? ??????,,,,
14,523454564,423423649,???? ???,,,,
15,530342423,305696031,??? ????,,,,
16,530342413,305696041,???? ??????,,,,
17,537642324,534234210,???? ???,,,,
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit 703b572

Please sign in to comment.