Merge branch 'test_parsing'

# Conflicts: # server/server/api/reports.py
InbarShirizly · Oct 6, 2020 · 703b572 · 703b572
2 parents 485b53e + 1e90030
commit 703b572
Show file tree

Hide file tree

Showing 26 changed files with 434 additions and 104 deletions.
diff --git a/server/server/api/reports.py b/server/server/api/reports.py
@@ -1,6 +1,6 @@
 from server.api import api, custom_types
 from flask_restful import Resource, reqparse, abort, marshal
-from server.parsing.attendance_check import Attendance
+from server.parsing.attendance import Attendance
 from server import db, auth
 from datetime import datetime
 import pandas as pd

diff --git a/server/server/config.py b/server/server/config.py
@@ -8,13 +8,13 @@ class FlaskConfig:
 
 class ParseConfig:
     FILE_COLS_DICT = {
-        "name": ["שם התלמיד", "תלמידים", "שמות", "שם", "סטודנט"],
-        "id_number": ["תעודת זהות", "ת.ז.", "ת.ז", "תז"],
-        "phone": ["טלפון", "מספר טלפון", "מס טלפון"],
-        "gender": ["מין"],
-        "org_class": ["כיתה"]
+        "name": ["שם התלמיד", "תלמידים", "שמות", "שם", "סטודנט", "name", "student_name", "student"],
+        "id_number": ["תעודת זהות", "ת.ז.", "ת.ז", "תז", "id", "number_id", "id_number"],
+        "phone": ["טלפון", "מספר טלפון", "מס טלפון", "phone", "phone_number"],
+        "gender": ["מין", "gender"],
+        "org_class": ["כיתה", "org_class", "class"]
     }
-    MASHOV_COLS = ["name", "org_class"]
+    MASHOV_COLS = ["name", "org_class", "id_number"]
     GENDER_DICT = {1: ["זכר", "ז", "(ז)"], 0: ["נקבה", "נ", "(נ)"]}
 
 

diff --git a/server/server/parsing/__init__.py b/server/server/parsing/__init__.py
@@ -1,8 +1,8 @@
 from server.config import ParseConfig
-from server.parsing.loading_classroom_file import ParseClassFile
+from server.parsing.parse_class_file import ParseClassFile
 from collections import namedtuple
 
 
 parser = ParseClassFile.from_object(ParseConfig)
 
-AttendanceMetaData = namedtuple('meta_data', ['filter_modes', 'time_delta', 'start_sentence', 'not_included_zoom_users'])
+AttendanceMetaData = namedtuple('meta_data', ['filter_modes', 'time_delta', 'start_sentence', 'zoom_names_to_ignore'])
diff --git a/server/server/parsing/attendance_check.py → server/server/parsing/attendance.py b/server/server/parsing/attendance_check.py → server/server/parsing/attendance.py
@@ -11,18 +11,18 @@ class Attendance:
      1. report_sessions - session object
      2. student_status_table - df of "student" table
     """
-    def __init__(self, chat_df, students_df, filter_modes, time_delta, start_sentence, not_included_zoom_users):
+    def __init__(self, chat_df, students_df, filter_modes, time_delta, start_sentence, zoom_names_to_ignore):
         """
         :param chat_df: zoom chat (df)
         :param students_df: student class raw data (df)
         :param filter_modes: filters the user picked for parsing the text file (list of str)
         :param time_delta: max time from start sentence to the last message to parse in each session in minutes (int)
         :param start_sentence: start sentence that initiate sessions for parse (str)
-        :param not_included_zoom_users: zoom names that will not be considered (list of str)
+        :param zoom_names_to_ignore: zoom names that will not be considered (list of str)
         :return: data frame with the data from the chat
         """
         meta_data = AttendanceMetaData(filter_modes=filter_modes, time_delta=time_delta,
-                                       start_sentence=start_sentence, not_included_zoom_users=not_included_zoom_users)
+                                       start_sentence=start_sentence, zoom_names_to_ignore=zoom_names_to_ignore)
 
         self.first_message_time = chat_df["time"].sort_values().iloc[0] # get time of first message in the chat
         start_indices = Attendance.get_start_indices(chat_df, meta_data)
@@ -42,7 +42,7 @@ def get_start_indices(df, meta_data):
         :param meta_data: configurations of the user
         :return: list of indices of start of session
         """
-        not_included_zoom_users_filt = df['zoom_name'].str.contains('|'.join(meta_data.not_included_zoom_users))
+        not_included_zoom_users_filt = df['zoom_name'].str.contains('|'.join(meta_data.zoom_names_to_ignore))
         not_included_zoom_users_df = df[not_included_zoom_users_filt]
         check_sentence = lambda string: meta_data.start_sentence.lower() in string.lower()
         start_indices = not_included_zoom_users_df.index[not_included_zoom_users_df['message'].apply(check_sentence)]
@@ -86,24 +86,3 @@ def student_status_table(self, report_id):
         return df_status_report.loc[:, ["student_id", "report_id", "status"]]
 
 
-if __name__ == '__main__':
-    from collections import namedtuple
-    from utils import create_chat_df, create_students_df
-
-    AttendanceMetaData = namedtuple('meta_data',
-                                    ['filter_modes', 'time_delta', 'start_sentence', 'not_included_zoom_users'])
-
-    chat_file_path = r"C:\Users\Inbar Shirizly\Documents\python\useful\ITC_programs\zoom_attendance_check\chat files\meeting_example_full_name.txt"
-    excel_file_path = r"C:\Users\Inbar Shirizly\Documents\python\useful\ITC_programs\zoom_attendance_check\student_csv_examples\example_data_already_prepared.xlsx"
-
-
-    with open(chat_file_path, "r", encoding="utf-8") as f:
-        chat_df = create_chat_df(f.readlines())
-    df_students = create_students_df(file_name=excel_file_path.split("\\")[-1], file_data=excel_file_path)
-
-    my_class = Attendance(chat_df, df_students, ['name', "id_number", "phone"], 1, "Attendance check", ["ITC", "Tech", "Challenge"])
-    a = my_class.student_status_table(1)
-    print(a)
-    # df_part_session = my_class._sessions[0]
-    # df_part_session.zoom_names_table(2)
-
diff --git a/server/server/parsing/loading_classroom_file.py b/server/server/parsing/loading_classroom_file.py
diff --git a/server/server/parsing/parse_class_file.py b/server/server/parsing/parse_class_file.py
@@ -0,0 +1,88 @@
+import pandas as pd
+import numpy as np
+import re
+
+DELETE_ROWS_CONTAIN = ["הופק בתאריך"]  # TODO: move to config
+
+class ParseClassFile:
+
+    def __init__(self, file_cols_dict, mashov_cols, gender_dict):
+        self._file_cols_dict = file_cols_dict
+        self._mashov_cols = mashov_cols
+        self._gender_dict = gender_dict
+
+    @classmethod
+    def from_object(cls, config):
+        return cls(
+            config.FILE_COLS_DICT,
+            config.MASHOV_COLS,
+            config.GENDER_DICT
+        )
+
+    def parse_df(self, df_students):
+
+        if ParseClassFile.check_if_mashov_file(df_students):
+            df_students = self.mashov_file(df_students)
+        else:
+            df_students = self.classic_file(df_students)
+
+        for col in self._file_cols_dict.keys():
+            try:
+                df_students[col] = df_students[col]
+            except KeyError:
+                df_students[col] = pd.Series([np.nan] * df_students.shape[0])
+
+        final_df = df_students[list(self._file_cols_dict.keys())]
+
+        return final_df.reset_index().drop(columns="index")
+
+
+    @staticmethod
+    def check_if_mashov_file(df_students):
+        df_students.dropna(axis=0, how="all", inplace=True)
+        df_students.dropna(axis=1, how="all", inplace=True)
+
+        for col in df_students.columns:
+            if df_students[col].astype(str).str.match(r"(\d+.)([\u0590-\u05fe ]+)([(\u0590-\u05fe)]+)").any():
+                df_students.rename(columns={col: "name"}, inplace=True)
+                return True
+        return False
+
+    def mashov_file(self, df_students):
+        df_t = df_students.T
+        cols_to_drop = []
+        for col in df_t.columns:
+            if df_t[col].str.contains('|'.join(DELETE_ROWS_CONTAIN)).any():
+                cols_to_drop.append(col)
+        df_students = df_t.drop(columns=cols_to_drop).T
+
+        df_students.rename(columns={"ת.ז.": 'id_number', "כיתה": "org_class"}, inplace=True)
+        try:
+            df_students = df_students.loc[:, self._mashov_cols]
+        except KeyError:
+            raise ValueError("File content is invalid to the program configurations")
+
+        mashov_name_pattern = re.compile(r"([\u0590-\u05fe ]+)([(\u0590-\u05fe)]+)")
+        df_name_gender = df_students['name'].str.extract(mashov_name_pattern, expand=False)
+        df_students['gender'] = df_name_gender[1].str.extract("\(([\u0590-\u05fe ])\)")
+        df_students['gender'] = df_students['gender'].apply(self.gender_assign, gender_dict=self._gender_dict)
+        df_students['name'] = df_name_gender[0]
+        return df_students
+
+
+    def classic_file(self, df_students):
+        relevant_cols = [col for col in df_students.columns if not col.startswith("Unnamed")]
+        current_excel_dict = {}
+        for col in relevant_cols:
+            for key, col_options in self._file_cols_dict.items():
+                if col in col_options:
+                    current_excel_dict[key] = df_students[col]
+        return pd.DataFrame(current_excel_dict)
+
+
+    @staticmethod
+    def gender_assign(string, gender_dict):
+        for key, vals in gender_dict.items():
+            if string in vals:
+                return key
+        return ""
diff --git a/server/server/parsing/session.py b/server/server/parsing/session.py
@@ -26,14 +26,13 @@ def get_participants_in_session(df_students, df_chat, meta_data):
         df_participated["index"] = df_participated["index"].astype(int)
         df_participated = df_participated.loc[:, ["id", "zoom_name", "time", "message", "index"]].set_index("index")
 
-        filt = df_chat['zoom_name'].str.contains('|'.join(meta_data.not_included_zoom_users))
+        filt = df_chat['zoom_name'].str.contains('|'.join(meta_data.zoom_names_to_ignore))
         df_relevant_chat = pd.merge(df_chat[~filt], df_participated, how="left")
 
         df_relevant_chat["relevant"] = df_relevant_chat["id"].apply(lambda x: 1 if x == x else 0)
         df_relevant_chat["id"] = df_relevant_chat["id"].apply(lambda x: int(x) if x == x else -1)
         return df_relevant_chat
 
-
     def zoom_names_table(self, session_id):
         zoom_df = self._relevant_chat.loc[:, ["zoom_name", "id"]].rename(columns={"zoom_name": "name", "id": "student_id"})
         zoom_df['session_id'] = pd.Series([session_id] * zoom_df.shape[0])

diff --git a/server/server/parsing/utils.py b/server/server/parsing/utils.py
@@ -10,15 +10,41 @@ def create_chat_df(chat_file):
     chat_df = pd.DataFrame(chat_content, columns=["time", "zoom_name", "message"])
     chat_df['message'] = chat_df['message'].str[:-1].astype(str)
     chat_df["time"] = chat_df["time"].apply(lambda string: datetime.strptime(string, "%H:%M:%S"))
-
+    if chat_df.empty:
+        raise ValueError("Entered file is empty")
     return chat_df
 
 
 def create_students_df(file_name, file_data):
     if file_name.endswith(".csv"):
-        df_students = pd.read_csv(file_data)
+        df_students = pd.read_csv(file_data, header=None)
     elif file_name.endswith(".xlsx"):
-        df_students = pd.read_excel(file_data)
+        df_students = pd.read_excel(file_data, header=None)
     else:
-        df_students = pd.read_html(file_data, header=1)[0]
-    return df_students
+        try:
+            df_students = pd.read_html(file_data, header=1)[0]
+        except ValueError:
+            df_students = pd.ExcelFile(file_data).parse()
+
+    clean_df = clean_student_df(df_students)
+
+    if clean_df.shape[0] > 200:
+        raise ValueError("Input file have to many records")  #TODO: pass amount of records as config
+    if clean_df.empty:
+        raise ValueError("Entered file is empty")
+    return clean_df
+
+
+def clean_student_df(df_students):
+    # first drop all rows and columns that are entirely missing (for extreme cases)
+    df_students.dropna(axis=0, how="all", inplace=True)
+    df_students.dropna(axis=1, how="all", inplace=True)
+
+    # keep only columns with enough unique values - at least 3 (a title and 2 students), or the median across columns if higher
+    min_nunique_in_cols = max(df_students.nunique().median(), 3)
+    filt_relevant_cols = df_students.nunique() >= min_nunique_in_cols
+    df_students = df_students.loc[:, filt_relevant_cols]
+    df_students = pd.DataFrame(df_students.values[1:], columns=df_students.iloc[0])
+    return df_students
+
+
diff --git a/server/test/files_to_test/chat_files/chat_file_empty.txt b/server/test/files_to_test/chat_files/chat_file_empty.txt
diff --git a/server/test/files_to_test/chat_files/chat_file_not_structured.txt b/server/test/files_to_test/chat_files/chat_file_not_structured.txt
@@ -0,0 +1 @@
+I love to go to school
diff --git a/server/test/files_to_test/chat_files/chat_file_not_structured_partially.txt b/server/test/files_to_test/chat_files/chat_file_not_structured_partially.txt
@@ -0,0 +1,4 @@
+10:56:16	 From  
+10:56:18	 From  
+10:56:19	 From  
+10:56:20	 From 
diff --git a/server/test/files_to_test/chat_files/chat_file_valid.txt b/server/test/files_to_test/chat_files/chat_file_valid.txt
@@ -0,0 +1,32 @@
+# teacher = "Roei teacher", "start sentence": "בדיקת נוכחות, zoom_names_to_ignore = ["Roei", "Elad Visitor"]
+
+# session 1 - should find ["מיכל קליימן", "ענבר עדי", "זאב הרצל"]. In case of >=3 minutes check & check with phone (as well) - will find "מיתר כהן" 
+
+10:46:16	 From  Roei teacher : בדיקת נוכחות:
+10:46:18	 From  Idan Aviv : here
+10:46:19	 From  Michal : מיכל קליימן
+10:46:20	 From  Inbar Adi : ענבר עדי
+10:46:22	 From  Zeave H : 305696031
+10:48:22	 From  Meitar : 537642324
+
+
+# session 2 - should find ["מיכל קליימן", "ענבר עדי", "אביתר כהן", "עידן אביב"]. here "Zeave H" (wrote another name and then himself - will ignore it). minutes  >=2 for "מיתר כהן"
+10:57:22	 From  Roei teacher : בדיקת נוכחות:
+10:57:42	 From  Zeave H : אביתר כהן
+10:57:43	 From  Idan Aviv : עידן אביב
+10:57:52	 From  Zeave H : 305696031
+10:57:53	 From  Inbar Adi : ענבר עדי
+10:57:54	 From  Michal : מיכל קליימן
+10:57:55	 From  Dana : here
+10:58:46	 From  Ron : רון זהבי
+10:58:54	 From  Meitar : 537642324
+
+# session 3 - should find ["מיכל קליימן", "עידן אביב"] - checking not auth user writing the "start sentence"
+
+11:57:16	 From  Roei teacher : בדיקת נוכחות
+11:57:43	 From  Idan Aviv : עידן אביב
+11:57:43	 From  Idan Aviv : שלום
+11:57:45	 From  Idan Aviv : בדיקת נוכחות
+11:57:46	 From  Elad Visitor : my not relevant message I'm visitor
+11:57:54	 From  Michal : מיכל קליימן
+
diff --git a/server/test/files_to_test/students_list_excel/example_csv.csv b/server/test/files_to_test/students_list_excel/example_csv.csv
@@ -0,0 +1,10 @@
+id,phone,id_number,name,org_class,gender,class_id
+11,528702484,305049421,???? ????,,,
+12,524291930,123424343,????? ???,,,
+13,526148959,432424455,???? ??????,,,
+14,523454564,423423649,???? ???,,,
+15,530342423,305696031,??? ????,,,
+16,530342413,305696041,???? ??????,,,
+17,537642324,534234210,???? ???,,,
+,,,,,,
+18,537642324,534234453,???? ???,,,
diff --git a/server/test/files_to_test/students_list_excel/example_csv_2.csv b/server/test/files_to_test/students_list_excel/example_csv_2.csv
@@ -0,0 +1,8 @@
+id,phone,id_number,name,org_class,gender,class_id,
+11,528702484,305049421,???? ????,,,,
+12,524291930,123424343,????? ???,,,,
+13,526148959,432424455,???? ??????,,,,
+14,523454564,423423649,???? ???,,,,
+15,530342423,305696031,??? ????,,,,
+16,530342413,305696041,???? ??????,,,,
+17,537642324,534234210,???? ???,,,,
diff --git a/server/test/files_to_test/students_list_excel/example_csv_3.csv b/server/test/files_to_test/students_list_excel/example_csv_3.csv
@@ -0,0 +1 @@
+id,phone,id_number,name,org_class,gender,class_id,11,528702484,305049421,???? ????,,,,12,524291930,123424343,????? ???,,,,13,526148959,432424455,???? ??????,,,,14,523454564,423423649,???? ???,,,,15,530342423,305696031,??? ????,,,,16,530342413,305696041,???? ??????,,,,17,537642324,534234210,???? ???,,,,

diff --git a/server/test/files_to_test/students_list_excel/example_csv_4.csv b/server/test/files_to_test/students_list_excel/example_csv_4.csv
@@ -0,0 +1,8 @@
+id,phone,id_number,name,org_class,gender,class_id,
+11,528702484,305049421,???? ????,,,,
+12,524291930,123424343,????? ???,,,,
+13,526148959,432424455,???? ??????,,,,
+14,523454564,423423649,???? ???,,,,
+15,530342423,305696031,??? ????,,,,
+16,530342413,305696041,???? ??????,,,,
+17,537642324,534234210,???? ???,,,,
diff --git a/server/test/files_to_test/students_list_excel/example_excel.xlsx b/server/test/files_to_test/students_list_excel/example_excel.xlsx
diff --git a/server/test/files_to_test/students_list_excel/example_excel_start_in_random_row.xlsx b/server/test/files_to_test/students_list_excel/example_excel_start_in_random_row.xlsx
diff --git a/server/test/files_to_test/students_list_excel/example_excel_too_much_records.xlsx b/server/test/files_to_test/students_list_excel/example_excel_too_much_records.xlsx
diff --git a/server/test/files_to_test/students_list_excel/example_mashov_file_edited_and_saved_97.xls b/server/test/files_to_test/students_list_excel/example_mashov_file_edited_and_saved_97.xls
diff --git a/..._to_test/students_list_excel/example_mashov_file_edited_and_saved_97_with_filled_data.xls b/..._to_test/students_list_excel/example_mashov_file_edited_and_saved_97_with_filled_data.xls
diff --git a/server/test/files_to_test/students_list_excel/example_mashov_file_empty.xls b/server/test/files_to_test/students_list_excel/example_mashov_file_empty.xls
diff --git a/server/test/files_to_test/students_list_excel/דוגמה לרשימת תלמידים.xlsx b/server/test/files_to_test/students_list_excel/דוגמה לרשימת תלמידים.xlsx
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		id,phone,id_number,name,org_class,gender,class_id,11,528702484,305049421,???? ????,,,,12,524291930,123424343,????? ???,,,,13,526148959,432424455,???? ??????,,,,14,523454564,423423649,???? ???,,,,15,530342423,305696031,??? ????,,,,16,530342413,305696041,???? ??????,,,,17,537642324,534234210,???? ???,,,,
Expand Down