From 9dda5d5491290d04e46f7b710115e2af295e408b Mon Sep 17 00:00:00 2001 From: Michael Rossetti Date: Thu, 14 Dec 2023 16:00:28 -0500 Subject: [PATCH] Submissions Processing (#1) Improves processing and comparison of all submission notebook documents. --- .gitignore | 12 ++ README.md | 75 +++++++++- app/__init__.py | 6 + app/cell.py | 33 +++++ app/document_formatting.py | 33 +++++ app/document_processor.py | 138 +++++++++--------- app/jobs/starter.py | 45 ------ app/jobs/submissions.py | 128 ----------------- app/prompts.py | 6 + app/starter_doc_processor.py | 76 ++++++++++ app/submissions_manager.py | 21 +-- app/submissions_processor.py | 219 +++++++++++++++++++++++++++++ conftest.py | 7 + results/.gitkeep | 0 test/cell_test.py | 29 ++++ test/document_processor_test.py | 22 +-- test/results/cells.csv | 34 +++++ test/results/notebooks.csv | 3 + test/submissions_manager_test.py | 27 ++++ test/submissions_processor_test.py | 69 +++++++++ 20 files changed, 714 insertions(+), 269 deletions(-) create mode 100644 app/__init__.py create mode 100644 app/cell.py create mode 100644 app/document_formatting.py delete mode 100644 app/jobs/starter.py delete mode 100644 app/jobs/submissions.py create mode 100644 app/prompts.py create mode 100644 app/starter_doc_processor.py create mode 100644 app/submissions_processor.py create mode 100644 results/.gitkeep create mode 100644 test/cell_test.py create mode 100644 test/results/cells.csv create mode 100644 test/results/notebooks.csv create mode 100644 test/submissions_manager_test.py create mode 100644 test/submissions_processor_test.py diff --git a/.gitignore b/.gitignore index 68bc17f..653bd81 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,15 @@ + +# ignore artifacts saved to results dir: +results/*.csv +results/*.png +results/*.html + +# ignore artifacts from testing (we just saved a temporary copy of results there for example purposes) +test/results/* + + + + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff 
--git a/README.md b/README.md index 308c487..9ac473b 100644 --- a/README.md +++ b/README.md @@ -1,32 +1,97 @@ -# homework-grader-py +# Langchain TA (Automated Homework Grading Agent) +An AI agent for grading homework assignments submitted as .IPYNB notebook documents. -Setup environment: +For this particular use case we assume the homework submission documents are based on a common "starter" / instructions document. And we will grade the homeworks based only on the differences (i.e. unique submission content only). + +Capabilities: + + 1. **Cell-based Document Splitting**: We use intelligent cell-based splitting of the .IPYNB notebook documents that allows us to reference each cell separately, and reference the code cells and text cells separately, as needed. We generate artifacts from the document splitting process like a CSV file of all cell contents and metadata, to help speed up the grading process, without the use of AI agents. + + 2. **Document Retrieval**: We use text embedding models to query the documents, to find the most relevant cell content for each question. We generate artifacts from the relevance search process which may further speed up the grading process without the use of AI agents. + + 3. **Retrieval Augmented Generation (RAG)**: Finally we leverage an AI agent to grade each homework document based on the relevant cell contents for each question. We feed the agent only the relevant contents for each question, rather than the entire submissions file, to cut down on costs, as currently we are using OpenAI based LLM models that incur costs based on the number of tokens / characters used in the prompts we pass to the model. + + + +## Setup + +### Environment Setup + +Setup virtual environment: ```sh conda create -n langchain-2024 python=3.10 conda activate langchain-2024 +``` +Install package dependencies: + +```sh pip install -r requirements.txt ``` -Create ".env" file: +### Submission Files Setup + +Setup submission files: + +1. 
Download submission files from the learning management system. It will be a zip file of .IPYNB files. +2. Unzip, and note the directory (i.e. `SUBMISSIONS_DIRPATH`). +3. Move a copy of the starter notebook (which contains instructions and some starter code) into the submissions directory, and rename it so it contains "STARTER" somewhere in the file name. + + +### OpenAI Setup + +Obtain an OpenAI API Key (i.e. `OPENAI_API_KEY`). + + +### Environment Variables Setup + +Create ".env" file and set environment variables: + +```sh +# this is the ".env" file... -``` OPENAI_API_KEY="sk-..." + SUBMISSIONS_DIRPATH="/Users/USERNAME/Desktop/GRADING HW 4" ``` ## Usage +Demonstrate ability to access submission files: + ```sh python -m app.submissions_manager ``` +Process the starter file: + +```sh +python -m app.starter_doc_processor + +# FIG_SHOW=false python -m app.starter_doc_processor + +# FIG_SHOW=false CHUNK_SIZE=600 CHUNK_OVERLAP=0 python -m app.starter_doc_processor + +# FIG_SHOW=false CHUNK_SIZE=600 CHUNK_OVERLAP=0 SIMILARITY_THRESHOLD=0.75 python -m app.starter_doc_processor +``` + +Process all submission files: + +```sh +python -m app.submissions_processor + +#FIG_SHOW=false CHUNK_SIZE=600 CHUNK_OVERLAP=0 python -m app.submissions_processor +``` + +## Testing + +Run tests: ```sh -python -m app.document_processor +pytest --disable-pytest-warnings ``` diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..2b9aec5 --- /dev/null +++ b/app/__init__.py @@ -0,0 +1,6 @@ + + + +import os + +RESULTS_DIRPATH = os.path.join(os.path.dirname(__file__), "..", "results") diff --git a/app/cell.py b/app/cell.py new file mode 100644 index 0000000..77d7d35 --- /dev/null +++ b/app/cell.py @@ -0,0 +1,33 @@ + +from langchain.docstore.document import Document + +from app.text_splitter import parse_cell_type + +EMPTY_CODE_CELL = "'code' cell: '[]'" +EMPTY_TEXT_CELL = "'markdown' cell: '[]'" + +class Cell(Document): + # 
https://github.com/langchain-ai/langchain/blob/451c5d1d8c857e61991a586a5ac94190947e2d80/libs/core/langchain_core/documents/base.py#L9 + + def __init__(self, page_content:str, metadata=None): + metadata = metadata or {} + super().__init__(page_content=str(page_content), metadata=metadata, type="Document") + + self.metadata["cell_type"] = parse_cell_type(self.page_content) + self.metadata["is_empty"] = self.is_empty + + @property + def cell_type(self): + return self.metadata["cell_type"] + + @property + def is_code(self): + return bool(self.cell_type == "CODE") + + @property + def is_text(self): + return bool(self.cell_type == "TEXT") + + @property + def is_empty(self): + return bool(self.page_content.strip() in [EMPTY_CODE_CELL, EMPTY_TEXT_CELL]) diff --git a/app/document_formatting.py b/app/document_formatting.py new file mode 100644 index 0000000..dda5232 --- /dev/null +++ b/app/document_formatting.py @@ -0,0 +1,33 @@ + + + + + +def print_docs(docs, meta=False): + for doc in docs: + #print("----") + print(doc.page_content[0:50], "...", doc.page_content[-25:]) + if meta: + print(doc.metadata) + + + +def print_rows(rows): + for _, row in rows.iterrows(): + #print("----") + print(row["page_content"][0:50], "...", row["page_content"][-25:]) + + + + +from pandas import DataFrame + +def documents_to_df(docs): + """Converts list of Docs to a DataFrame. 
Includes columns for doc metadata and page content.""" + records = [] + for doc in docs: + metadata = doc.metadata + metadata["page_content"] = doc.page_content + records.append(metadata) + df = DataFrame(records) + return df diff --git a/app/document_processor.py b/app/document_processor.py index cf68511..afc0c98 100644 --- a/app/document_processor.py +++ b/app/document_processor.py @@ -15,33 +15,23 @@ from langchain.text_splitter import CharacterTextSplitter, PythonCodeTextSplitter, MarkdownTextSplitter #, MarkdownHeaderTextSplitter from langchain.embeddings.openai import OpenAIEmbeddings from langchain.vectorstores.faiss import FAISS +from langchain.retrievers.document_compressors import EmbeddingsFilter +from langchain.retrievers import ContextualCompressionRetriever -from app.text_splitter import split_text_by_substrings, parse_cell_type, TEXT_CELL_PREFIX, CODE_CELL_PREFIX from app.colors import CELL_COLORS_MAP +from app.text_splitter import split_text_by_substrings #parse_cell_type, TEXT_CELL_PREFIX, CODE_CELL_PREFIX +from app.cell import Cell +from app.document_formatting import documents_to_df + load_dotenv() -CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", default="1_000")) +CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", default="1_000")) # 500 CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", default="0")) # 40 +SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", default="0.75")) -#def print_docs(docs, meta=False): -# for doc in docs: -# #print("----") -# print(doc.page_content[0:50], "...", doc.page_content[-10:]) -# if meta: -# print(doc.metadata) - - -#class Cell(Document): -# -# #def metadata(self): -# # meta = super().metadata -# # meta["cell_type"] = parse_cell_type(self.page_content) -# # return meta -# -# def cell_type(self): -# return parse_cell_type(self.page_content) +FIG_SHOW = bool(os.getenv("FIG_SHOW", default="true") == "true") @@ -49,17 +39,19 @@ class DocumentProcessor: """Processes .IPYNB notebook documents.""" - def __init__(self, filepath, 
chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE, verbose=True): + def __init__(self, filepath, chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE, verbose=True, similarity_threshold=SIMILARITY_THRESHOLD, file_id=None): """Param : filepath to the notebook document""" self.filepath = filepath self.filename = self.filepath.split("/")[-1] # might not work on windows? + self.file_id = file_id or self.filename.split("_")[1] # assumes files are named like "Homework 4_USERNAME_more_stuff.ipynb" ... todo: regex instead, to be more precise - self.chunk_overlap = chunk_overlap - self.chunk_size = chunk_size + self.chunk_overlap = int(chunk_overlap) + self.chunk_size = int(chunk_size) self.embeddings_model_name = "text-embedding-ada-002" #self.faiss_index = self.filepath.upper().replace(".IPYNB", "") + "_FAISS_INDEX" + self.similarity_threshold = float(similarity_threshold) self.verbose = bool(verbose) if self.verbose: @@ -73,12 +65,11 @@ def docs(self): print("LOADING...") docs = loader.load() - if self.verbose: - print("DOCS:", len(docs)) + #if self.verbose: + # print("DOCS:", len(docs)) assert len(docs) == 1 # right? 
let's see if this is never the case return docs - @cached_property def doc(self): return self.docs[0] @@ -88,37 +79,53 @@ def doc(self): @cached_property def cells(self): cell_docs = [] - cell_texts = split_text_by_substrings(str(self.doc.page_content), TEXT_CELL_PREFIX, CODE_CELL_PREFIX) + cell_texts = split_text_by_substrings(str(self.doc.page_content)) for i, cell_text in enumerate(cell_texts): cell_metadata = { #"filepath": self.filepath, + "file_id": self.file_id, "filename": self.filename, "cell_id": i+1, - "cell_type": parse_cell_type(cell_text), + #"cell_type": parse_cell_type(cell_text), "cell_length": len(cell_text) } #chunk.metadata = {**chunk.metadata, **cell_metadata} # dict merge - doc = Document(page_content=cell_text, metadata=cell_metadata) + #doc = Document(page_content=cell_text, metadata=cell_metadata) + doc = Cell(page_content=cell_text, metadata=cell_metadata) cell_docs.append(doc) return cell_docs @cached_property def text_cells(self): - return [cell for cell in self.cells if cell.metadata["cell_type"] == "TEXT"] + #return [cell for cell in self.cells if cell.metadata["cell_type"] == "TEXT"] + return [cell for cell in self.cells if cell.is_text] @cached_property def code_cells(self): - return [cell for cell in self.cells if cell.metadata["cell_type"] == "CODE"] + #return [cell for cell in self.cells if cell.metadata["cell_type"] == "CODE"] + return [cell for cell in self.cells if cell.is_code] @cached_property def cells_df(self): - records = [] - for cell in self.cells: - metadata = cell.metadata - metadata["page_content"] = cell.page_content - records.append(metadata) - df = DataFrame(records) - #df.index = df["cell_id"] - return df + return documents_to_df(self.cells) + + def plot_cell_lengths(self, fig_show=FIG_SHOW, height=500): + title = f"Cell Lengths" + #subtitle = f"Text Cells: {len(self.text_cells)} | Code Cells: {len(self.code_cells)}" + subtitle = f"Document: {self.filename} | Text Cells: {len(self.text_cells)} | Code Cells: 
{len(self.code_cells)}" + title += f"
{subtitle}" + + fig = px.violin(self.cells_df, x="cell_length", facet_row="cell_type", + color="cell_type", color_discrete_map=CELL_COLORS_MAP, + title=title, height=height, points="all", box=True, + ) + #fig.add_annotation(text= (f"Document: {self.filepath}"), + # font=dict(size=10, color="grey"), align="left", showarrow=False, + # x= 0, xref='paper', xanchor='left', xshift=-1, + # y= -0.15, yref='paper', yanchor='bottom', yshift=-5, + #) + + if fig_show: + fig.show() # CHUNKS (TEXT VS CODE): @@ -158,37 +165,9 @@ def code_chunks(self): @cached_property def chunks_df(self): - # consider adding the page contents as well - #return DataFrame([chunk.metadata for chunk in self.chunks]) - records = [] - for chunk in self.chunks: - metadata = chunk.metadata - metadata["page_content"] = chunk.page_content - records.append(metadata) - return DataFrame(records) - - # PLOTTING: - - def plot_cell_lengths(self, fig_show=True, height=500): - title = f"Cell Lengths" - #subtitle = f"Text Cells: {len(self.text_cells)} | Code Cells: {len(self.code_cells)}" - subtitle = f"Document: {self.filename} | Text Cells: {len(self.text_cells)} | Code Cells: {len(self.code_cells)}" - title += f"
{subtitle}" - - fig = px.violin(self.cells_df, x="cell_length", facet_row="cell_type", - color="cell_type", color_discrete_map=CELL_COLORS_MAP, - title=title, height=height, points="all", box=True, - ) - #fig.add_annotation(text= (f"Document: {self.filepath}"), - # font=dict(size=10, color="grey"), align="left", showarrow=False, - # x= 0, xref='paper', xanchor='left', xshift=-1, - # y= -0.15, yref='paper', yanchor='bottom', yshift=-5, - #) - - if fig_show: - fig.show() + return documents_to_df(self.chunks) - def plot_chunk_lengths(self, fig_show=True, height=500): + def plot_chunk_lengths(self, fig_show=FIG_SHOW, height=500): title = f"Chunk Lengths ({self.chunk_size} chars max, {self.chunk_overlap} chars overlap)" #subtitle = f"Text Chunks: {len(self.text_chunks)} | Code Chunks: {len(self.code_chunks)}" + f" | Document: {self.filename}" subtitle=f"Document: {self.filename} | Text Chunks: {len(self.text_chunks)} | Code Chunks: {len(self.code_chunks)}" @@ -203,14 +182,13 @@ def plot_chunk_lengths(self, fig_show=True, height=500): if fig_show: fig.show() - - # EMBEDDINGS: + # EMBEDDINGS / RELEVANCE FILTERING: @property def embeddings_model(self): model = OpenAIEmbeddings(model=self.embeddings_model_name) - if self.verbose: - print(model) + #if self.verbose: + # print(type(model)) return model def make_retriever(self, cell_type="TEXT", storage_strategy="chunks"): @@ -248,3 +226,19 @@ def code_retriever(self): @cached_property def text_retriever(self): return self.make_retriever(cell_type="TEXT") + + # COMPRESSION: + + def make_compression_retriever(self, base_retriever, similarity_threshold=0.75): + compressor = EmbeddingsFilter(embeddings=self.embeddings_model, similarity_threshold=similarity_threshold) + compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=base_retriever) + #relevant_docs = compression_retriever.get_relevant_documents(STUDENT_QUERY) + return compression_retriever + + @cached_property + def 
text_compression_retriever(self): + return self.make_compression_retriever(self.text_retriever, self.similarity_threshold) + + @cached_property + def code_compression_retriever(self): + return self.make_compression_retriever(self.code_retriever, self.similarity_threshold) diff --git a/app/jobs/starter.py b/app/jobs/starter.py deleted file mode 100644 index 0879988..0000000 --- a/app/jobs/starter.py +++ /dev/null @@ -1,45 +0,0 @@ - -from app.submissions_manager import SubmissionsManager -from app.document_processor import DocumentProcessor, print_docs - - -if __name__ == "__main__": - - print("---------------") - print("SUBMISSIONS...") - sm = SubmissionsManager() - print(sm.dirpath) - print(len(sm.filenames)) - - print("---------------") - print("STARTER DOC...") - starter_filepath = sm.find_filepath(substr="STARTER") - dp = DocumentProcessor(starter_filepath) - #starter_doc = dp.doc - print("DOC(S):", len(dp.docs)) - - print("CELLS:", len(dp.cells)) - #print(dp.cells) - print(dp.cells_df.shape) - print(dp.cells_df.head()) - print(dp.cells_df.groupby("cell_type")["cell_length"].describe()) - - print("---------------") - print(f"TEXT CELLS ({len(dp.text_cells)}):") - print_docs(dp.text_cells) - - print("---------------") - print(f"CODE CELLS ({len(dp.code_cells)}):") - print_docs(dp.code_cells) - - dp.plot_cell_lengths() - - print("----------") - print("CHUNKS:", len(dp.chunks)) - print(dp.chunks_df.head()) - - dp.plot_chunk_lengths() - #starter_dp.chunks_df.drop(columns="source").to_csv("hw_4_cell_chunks.csv") - #starter_dp.chunks_df.drop(columns="source").head() - - #starter_dp.retriever diff --git a/app/jobs/submissions.py b/app/jobs/submissions.py deleted file mode 100644 index 9460dde..0000000 --- a/app/jobs/submissions.py +++ /dev/null @@ -1,128 +0,0 @@ -from warnings import filterwarnings -filterwarnings("ignore") - -from pandas import DataFrame, merge -import plotly.express as px - -from app.submissions_manager import SubmissionsManager -from 
app.document_processor import DocumentProcessor, print_docs -from app.colors import CELL_COLORS_MAP - -EMPTY_CODE_CELL = "'code' cell: '[]'" -EMPTY_TEXT_CELL = "'markdown' cell: '[]'" - - -if __name__ == "__main__": - - sm = SubmissionsManager() - print(sm.dirpath) - print(len(sm.filenames)) - - # STARTER NOTEBOOK: - - starter_filepath = sm.find_filepath("STARTER") - print(starter_filepath) - starter_dp = DocumentProcessor(starter_filepath) - starter_cells = starter_dp.cells - - # ALL NOTEBOOKS: - - all_cells = [] - records = [] - for filepath in sm.filepaths: - dp = DocumentProcessor(filepath, verbose=False) - avg_lengths = dp.cells_df.groupby("cell_type")["cell_length"].mean() - record = { - "notebook": dp.filename, - "length": len(dp.doc.page_content), #dp.docs_df["cell_length"].sum(), - "cells": len(dp.cells), - "code_cells": len(dp.code_cells), - "text_cells": len(dp.text_cells), - "code_avg_length": avg_lengths["CODE"].round(1), - "text_avg_length": avg_lengths["TEXT"].round(1), - - } - records.append(record) - all_cells += dp.cells - print(len(records)) - print(len(all_cells)) - - notebooks_df = DataFrame(records) - notebooks_df.index = notebooks_df["notebook"] - notebooks_df.drop(columns=["notebook"], inplace=True) - #notebooks_df.to_csv("hw_4_notebooks.csv") - #notebooks_df.head() - - chart_df = notebooks_df.copy() - chart_df["filename"] = chart_df.index - fig = px.violin(chart_df, x="length", box=True, points="all", height=400, - title="Document Lengths (All Submissions)", - hover_data=["filename"] - ) - #fig.show() - - # ALL CELLS - - cells_df = DataFrame([cell.metadata for cell in all_cells]) - - cells_df['dup_content'] = cells_df.duplicated(subset='page_content', keep=False) - print(cells_df["dup_content"].value_counts()) - - starter_rows = cells_df[ cells_df["filename"].str.contains("STARTER") ] - cells_df = merge(cells_df, starter_rows[["cell_id", "page_content"]], how='left', on='page_content', suffixes=('', '_starter')) - 
cells_df.rename(columns={"cell_id_starter": "starter_cell_id"}, inplace=True) - cells_df["starter_content"] = cells_df['starter_cell_id'].notna() - print(cells_df["starter_content"].value_counts()) - - - print("NON-STARTER DUP CELLS:") - nonstarter_dups = cells_df[ (cells_df["dup_content"] == True) & (cells_df["starter_content"] == False) ] - for i, row in nonstarter_dups.iterrows(): - if row["page_content"].strip() not in [EMPTY_CODE_CELL, EMPTY_TEXT_CELL]: - print("----") - #print(row["filename"][0:25], row["cell_id"]) - print(row["page_content"][0:250]) - - #cells_df.to_csv("cells.csv", index=False) - #print(all_cells_df.shape) - #all_cells_df.head() - - chart_df = cells_df.copy() - chart_df = chart_df[chart_df["cell_length"] <= 10_000] # filter out two outliers 25K, 30K - print(len(chart_df)) - fig = px.violin(chart_df, x="cell_length", box=True, points="all", height=500, - title="Cell Lengths (All Submissions)", - hover_data=["page_content"], facet_row="cell_type", - color="cell_type", color_discrete_map=CELL_COLORS_MAP - ) - fig.show() - - # cells_df[cells_df["page_content"].str.contains(" with output: ") ] - - # NON-STARTER CELLS: - - chart_df = cells_df.copy() - chart_df = chart_df[chart_df["cell_length"] <= 10_000] # filter out two outliers 25K, 30K - chart_df = chart_df[chart_df["starter_content"] == False] - print(len(chart_df)) - fig = px.violin(chart_df, x="cell_length", box=True, points="all", height=500, - title="Non-Starter Cell Lengths (All Submissions)", - hover_data=["page_content"], facet_row="cell_type", - color="cell_type", color_discrete_map=CELL_COLORS_MAP - ) - fig.show() - - # UNIQUE CELLS - - #cells_df.groupby(["cell_type", "dup_content"])["cell_length"].describe() - - chart_df = cells_df.copy() - chart_df = chart_df[chart_df["cell_length"] <= 10_000] # filter out two outliers 25K, 30K - chart_df = chart_df[chart_df["dup_content"] == False] - print(len(chart_df)) - fig = px.violin(chart_df, x="cell_length", box=True, points="all", 
height=500, - title="Unique Cell Lengths (All Submissions)", - hover_data=["page_content"], facet_row="cell_type", - color="cell_type", color_discrete_map=CELL_COLORS_MAP - ) - fig.show() diff --git a/app/prompts.py b/app/prompts.py new file mode 100644 index 0000000..21bcdb3 --- /dev/null +++ b/app/prompts.py @@ -0,0 +1,6 @@ + + + + + +STUDENT_QUERY = "What is the student's name? What is their GW ID?" diff --git a/app/starter_doc_processor.py b/app/starter_doc_processor.py new file mode 100644 index 0000000..e97ef9d --- /dev/null +++ b/app/starter_doc_processor.py @@ -0,0 +1,76 @@ +from warnings import filterwarnings +filterwarnings("ignore") + +from app.prompts import STUDENT_QUERY +from app.submissions_manager import SubmissionsManager +from app.document_processor import DocumentProcessor +from app.document_formatting import print_docs + +#from pandas import pivot_table + + + +def print_relevant_cells(cells): + total_length = 0 + for doc in cells: + print("\n") + print("CELL:", doc.metadata["cell_id"], "CHUNK:", doc.metadata["chunk_id"]) + print("LENGTH:", len(doc.page_content)) + print("CONTENT:", doc.page_content) + + total_length += len(doc.page_content) + + print("\n") + print("TOTAL LENGTH:", total_length) + #return total_length + + +if __name__ == "__main__": + + print("---------------") + print("SUBMISSIONS...") + sm = SubmissionsManager() + print(sm.dirpath) + print(len(sm.filenames)) + starter_filepath = sm.find_filepath(substr="STARTER") + + print("---------------") + print("STARTER DOC...") + dp = DocumentProcessor(starter_filepath) + + print("CELLS:", len(dp.cells)) + print("AVG LENGTHS:", dp.cells_df.groupby("cell_type")["cell_length"].mean()) + #print(dp.cells_df.groupby("cell_type")["cell_length"].describe()) + #pivot_table(dp.cells_df, index="cell_type", columns=["..."]) + + print("---------------") + print(f"TEXT CELLS ({len(dp.text_cells)}):") + print_docs(dp.text_cells) + print("---------------") + print(f"CODE CELLS ({len(dp.code_cells)}):") 
+ print_docs(dp.code_cells) + dp.plot_cell_lengths() + + print("----------") + print("CHUNKS:", len(dp.chunks)) + print("AVG LENGTHS:", dp.chunks_df.groupby("cell_type")["chunk_length"].mean()) + #print_docs(dp.chunks) + dp.plot_chunk_lengths() + + keep_going = input("CONTINUE TO QUERYING AND RETRIEVAL? ('Y'/'N'): ").upper() or "N" + if keep_going != "Y": + exit() + + print("QUERY:", STUDENT_QUERY) + + relevant_docs = dp.text_retriever.get_relevant_documents(STUDENT_QUERY) + print("-----------") + print(f"RELEVANT DOCS ({len(relevant_docs)}):") + print_relevant_cells(relevant_docs) + + compressed_docs = dp.text_compression_retriever.get_relevant_documents(STUDENT_QUERY) + print("-----------") + print(f"COMPRESSED DOCS: ({len(compressed_docs)}):") + print_relevant_cells(compressed_docs) + + #breakpoint() diff --git a/app/submissions_manager.py b/app/submissions_manager.py index 781d83d..45cdb0d 100644 --- a/app/submissions_manager.py +++ b/app/submissions_manager.py @@ -15,9 +15,10 @@ class SubmissionsManager: - def __init__(self, dirpath=SUBMISSIONS_DIRPATH, file_ext=".IPYNB"): + def __init__(self, dirpath=SUBMISSIONS_DIRPATH, file_ext=".IPYNB", starter_filename=None): self.dirpath = dirpath self.file_ext = file_ext + self.starter_filename = starter_filename @cached_property def filenames(self): @@ -34,9 +35,12 @@ def find_filepath(self, substr): return None - #@cached_property - #def starter_filepath(self): - # return self.find_filepath("STARTER") + @cached_property + def starter_filepath(self): + if self.starter_filename: + return os.path.join(self.dirpath, self.starter_filename) + else: + return self.find_filepath("STARTER") @@ -44,9 +48,6 @@ def find_filepath(self, substr): sm = SubmissionsManager() - print(sm.dirpath) - print(len(sm.filenames)) - - starter_filepath = sm.find_filepath("STARTER") - - print(starter_filepath) + print("SUBMISSIONS DIRPATH:", sm.dirpath) + print("FILES:", len(sm.filenames)) + print("STARTER DOC:", sm.starter_filepath) diff --git 
a/app/submissions_processor.py b/app/submissions_processor.py new file mode 100644 index 0000000..ab5c9ae --- /dev/null +++ b/app/submissions_processor.py @@ -0,0 +1,219 @@ +from warnings import filterwarnings +filterwarnings("ignore") + +import os +from pandas import DataFrame, merge +import plotly.express as px + +from app import RESULTS_DIRPATH +from app.colors import CELL_COLORS_MAP +from app.submissions_manager import SubmissionsManager, SUBMISSIONS_DIRPATH +from app.document_processor import DocumentProcessor, FIG_SHOW +#from app.document_formatting import print_docs, print_rows + + +class SubmissionsProcessor: + + def __init__(self, dirpath=SUBMISSIONS_DIRPATH, starter_filename=None, results_dirpath=RESULTS_DIRPATH): + """Can use a starter file, or not.""" + + self.submissions_dirpath = dirpath + self.starter_filename = starter_filename + + self.results_dirpath = results_dirpath or self.submissions_dirpath + self.notebooks_csv_filepath = os.path.join(self.results_dirpath, "notebooks.csv") + self.cells_csv_filepath = os.path.join(self.results_dirpath, "cells.csv") + + # get all submision files (consider passing them in for a looser coupling with the manager class): + self.sm = SubmissionsManager(self.submissions_dirpath, starter_filename=self.starter_filename) + print("SUBMISSIONS DIR:", self.sm.dirpath) + print("FILES:", len(self.sm.filenames)) + self.submission_filepaths = self.sm.filepaths + + # available post processing: + self.starter_dp = None + self.notebooks_df = None + self.cells_df = None + + + def perform(self): + """Processes all submission documents. Compares them relative to eachother and the starter document. + Produces a CSV file of document statistics, as well as a CSV file of cell contents and metadata. 
+ """ + + all_cells = [] + records = [] + for filepath in self.submission_filepaths: + dp = DocumentProcessor(filepath, verbose=False) + avg_lengths = dp.cells_df.groupby("cell_type")["cell_length"].mean() + record = { + "filename": dp.filename, + "file_id": dp.file_id, + "length": len(dp.doc.page_content), #dp.docs_df["cell_length"].sum(), + "cells": len(dp.cells), + "code_cells": len(dp.code_cells), + "text_cells": len(dp.text_cells), + "avg_code_cell_length": avg_lengths["CODE"].round(1), + "avg_text_cell_length": avg_lengths["TEXT"].round(1), + } + records.append(record) + all_cells += dp.cells + + print("------") + print("NOTEBOOKS:", len(records)) + self.notebooks_df = DataFrame(records) + #notebooks_df.index = notebooks_df["filename"] + #notebooks_df["file_id"] = notebooks_df["filename"].apply(lambda filename: filename.split("_")[1]) # todo: regex instead, to be more precise + + self.notebooks_df.to_csv(self.notebooks_csv_filepath, index=False) + + print("------") + print("CELLS:", len(all_cells)) + self.cells_df = DataFrame([cell.metadata for cell in all_cells]) + + print("------") + print("DUPLICATE CELLS:") + self.cells_df['dup_content'] = self.cells_df.duplicated(subset='page_content', keep=False) + print(self.cells_df["dup_content"].value_counts()) + + print("------") + print("STARTER CELLS:") # (~30% of cells are the same as starter cells) + #starter_rows = cells_df[ cells_df["filename"].str.contains("STARTER") ] + if self.starter_filename: + starter_rows = self.cells_df[ self.cells_df["filename"] == self.starter_filename ] + self.cells_df = merge(self.cells_df, starter_rows[["cell_id", "page_content"]], how='left', on='page_content', suffixes=('', '_starter')) + self.cells_df.rename(columns={"cell_id_starter": "starter_cell_id"}, inplace=True) + #self.cells_df["starter_content"] = self.cells_df['starter_cell_id'].notna() + #print(self.cells_df["starter_content"].value_counts()) + else: + self.cells_df["starter_cell_id"] = None + 
#self.cells_df["starter_content"] = False + self.cells_df["starter_content"] = self.cells_df['starter_cell_id'].notna() + print(self.cells_df["starter_content"].value_counts()) + + print("------") + print("EMPTY CELLS:") + print(self.cells_df["is_empty"].value_counts()) + + #print("------") + #print("NON-DUPLICATE NON-STARTER NON-BLANK CELLS:") + #dup_rows = self.cells_df[ (self.cells_df["starter_content"] == False) & (self.cells_df["dup_content"] == True) & (self.cells_df["is_empty"] == False)].sort_values(by="page_content") + #print_rows(dup_rows) + + self.cells_df.to_csv(self.cells_csv_filepath, index=False) + + + def plot_documents(self, fig_show=FIG_SHOW): + + print("------") + print("PLOTTING...") + + # PLOTTING: ORIGINAL DOCUMENTS + + chart_df = self.notebooks_df.copy() + #chart_df["filename"] = chart_df.index + avg_length = chart_df.groupby('filename')['length'].mean().mean() + title = "Document Lengths (All Content)" + title += f"
Documents: {len(chart_df):,.0f} | Avg Length: {avg_length:,.0f} chars" + fig = px.violin(chart_df, x="length", box=True, points="all", height=400, title=title, + hover_data=["file_id", "filename"] # "file_id", + ) + if fig_show: + fig.show() + + # PLOTTING: DOCUMENTS (UNIQUE CONTENT ONLY) + + chart_df = self.cells_df.copy() + chart_df = chart_df[chart_df["dup_content"] == False] + chart_df = chart_df[chart_df["starter_content"] == False] + chart_df = chart_df[chart_df["is_empty"] == False] + #chart_pivot = chart_df.groupby("filename")["cell_length"].sum() + #chart_pivot = chart_pivot.to_frame().rename(columns={"cell_length": "length"}) + #chart_pivot["filename"] = chart_pivot.index + chart_pivot = chart_df.groupby(["file_id", "filename"])["cell_length"].sum() + chart_pivot = chart_pivot.to_frame().rename(columns={"cell_length": "length"}) + chart_pivot.reset_index(inplace=True) # convert multi-index to columns, https://stackoverflow.com/a/25733562/670433 + avg_length = chart_pivot['length'].mean() + title = "Document Lengths (Unique Content Only)" + title += f"
Documents: {len(chart_pivot):,.0f} | Avg Length: {avg_length:,.0f} chars" + fig = px.violin(chart_pivot, x="length", box=True, points="all", height=400, title=title, + hover_data=["file_id", "filename"] + ) + if fig_show: + fig.show() + + + def plot_cells(self, fig_show=FIG_SHOW): + + # PLOTTING: CELLS (ALL) + + chart_df = self.cells_df.copy() + chart_df = chart_df[chart_df["cell_length"] <= 10_000] # filter out two outliers 25K, 30K + avg_length = chart_df["cell_length"].mean() + title = "Cell Lengths (All Content)" + title += f"
Cells: {len(chart_df):,.0f} | Avg Length: {avg_length:,.0f} chars" + fig = px.violin(chart_df, x="cell_length", box=True, points="all", height=500, title=title, + hover_data=["page_content"], facet_row="cell_type", + color="cell_type", color_discrete_map=CELL_COLORS_MAP + ) + if fig_show: + fig.show() + + # PLOTTING: CELLS (UNIQUE) + + chart_df = self.cells_df.copy() + chart_df = chart_df[chart_df["cell_length"] <= 10_000] # filter out two outliers 25K, 30K + chart_df = chart_df[chart_df["dup_content"] == False] + chart_df = chart_df[chart_df["starter_content"] == False] + chart_df = chart_df[chart_df["is_empty"] == False] + avg_length = chart_df["cell_length"].mean() + title = "Cell Lengths (Unique Content Only)" + title += f"
Cells: {len(chart_df):,.0f} | Avg Length: {avg_length:,.0f} chars" + fig = px.violin(chart_df, x="cell_length", box=True, points="all", height=500, title=title, + hover_data=["page_content"], facet_row="cell_type", + color="cell_type", color_discrete_map=CELL_COLORS_MAP + ) + if fig_show: + fig.show() + + + #print("NON-STARTER DUP CELLS:") + #nonstarter_dups = cells_df[ (cells_df["dup_content"] == True) & (cells_df["starter_content"] == False) ] + #for i, row in nonstarter_dups.iterrows(): + # if row["page_content"].strip() not in [EMPTY_CODE_CELL, EMPTY_TEXT_CELL]: + # print("----") + # #print(row["filename"][0:25], row["cell_id"]) + # print(row["page_content"][0:250]) + + #cells_df.to_csv("cells.csv", index=False) + #print(all_cells_df.shape) + #all_cells_df.head() + + + # cells_df[cells_df["page_content"].str.contains(" with output: ") ] + + # NON-STARTER CELLS: + + #chart_df = cells_df.copy() + #chart_df = chart_df[chart_df["cell_length"] <= 10_000] # filter out two outliers 25K, 30K + #chart_df = chart_df[chart_df["starter_content"] == False] + #fig = px.violin(chart_df, x="cell_length", box=True, points="all", height=500, + # title="Non-Starter Cell Lengths (All Submissions)", + # hover_data=["page_content"], facet_row="cell_type", + # color="cell_type", color_discrete_map=CELL_COLORS_MAP + #) + #fig.show() + + + + + + + +if __name__ == "__main__": + + sp = SubmissionsProcessor() + + sp.perform() + sp.plot_documents() + sp.plot_cells() diff --git a/conftest.py b/conftest.py index e69de29..2614388 100644 --- a/conftest.py +++ b/conftest.py @@ -0,0 +1,7 @@ + + + +import os + +TEST_DOCS_DIRPATH = os.path.join(os.path.dirname(__file__), "test", "documents") +TEST_RESULTS_DIRPATH = os.path.join(os.path.dirname(__file__), "test", "results") diff --git a/results/.gitkeep b/results/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/test/cell_test.py b/test/cell_test.py new file mode 100644 index 0000000..ff39098 --- /dev/null +++ b/test/cell_test.py @@ 
-0,0 +1,29 @@ +#from langchain.docstore.document import Document + +from app.cell import Cell + + +def test_cell_type(): + + # a cell is a wrapper for a document, which knows its cell type and has shortcut properties into the metadata: + code_cell_content = "'code' cell: '['import numpy as np']'" + text_cell_content = "'markdown' cell: '['If you now insert your cursor after `np` and press **Period**(`.`), you will see the list of available completions within the `np` module. Completions can be opened again by using **Ctrl+Space**.']'" + + # metadata and cell type: + + cell = Cell(page_content=code_cell_content) + assert cell.metadata == {"cell_type":"CODE", "is_empty": False} + assert cell.cell_type == "CODE" + + cell = Cell(page_content=code_cell_content, metadata={"number": 99}) + assert cell.metadata == {"number": 99, "cell_type":"CODE", "is_empty": False} + assert cell.cell_type == "CODE" + + cell = Cell(page_content=text_cell_content, metadata={"number": 99}) + assert cell.metadata == {"number": 99, "cell_type":"TEXT", "is_empty": False} + assert cell.cell_type == "TEXT" + + # empty: + + assert Cell(page_content="'code' cell: '[]'").is_empty + assert Cell(page_content="'markdown' cell: '[]'").is_empty diff --git a/test/document_processor_test.py b/test/document_processor_test.py index a799429..b900023 100644 --- a/test/document_processor_test.py +++ b/test/document_processor_test.py @@ -11,8 +11,7 @@ def test_document_processor(): #notebook_filepath = os.path.join(os.path.dirname(__file__), "documents", "Making_the_Most_of_your_Colab_Subscription.ipynb") notebook_filepath = os.path.join(os.path.dirname(__file__), "documents", "Overview_of_Colaboratory_Features.ipynb") - dp = DocumentProcessor(notebook_filepath) - cells = dp.cells + dp = DocumentProcessor(notebook_filepath, file_id="COLAB_FEATURES") assert len(dp.docs) == 1 assert dp.doc == dp.docs[0] @@ -24,14 +23,17 @@ def test_document_processor(): assert len(dp.text_cells) == 18 assert len(dp.code_cells) 
== 4 - assert isinstance(dp.cells[0], Document) - assert dp.cells[0].metadata == { + cell = dp.cells[0] + assert isinstance(cell, Document) + assert cell.metadata == { 'filename': 'Overview_of_Colaboratory_Features.ipynb', + 'file_id': "COLAB_FEATURES", 'cell_id': 1, 'cell_type': 'TEXT', - 'cell_length': 164 + 'cell_length': 164, + 'is_empty': False } - assert dp.cells[0].page_content == "'markdown' cell: '['# Cells', 'A notebook is a list of cells. Cells contain either explanatory text or executable code and its output. Click a cell to select it.']'" + assert cell.page_content == "'markdown' cell: '['# Cells', 'A notebook is a list of cells. Cells contain either explanatory text or executable code and its output. Click a cell to select it.']'" assert [cell.page_content for cell in dp.cells] == [ "'markdown' cell: '['# Cells', 'A notebook is a list of cells. Cells contain either explanatory text or executable code and its output. Click a cell to select it.']'", @@ -58,11 +60,13 @@ def test_document_processor(): "'markdown' cell: '['## Commenting on a cell', 'You can comment on a Colaboratory notebook like you would on a Google Document. Comments are attached to cells, and are displayed next to the cell they refer to. If you have **comment-only** permissions, you will see a comment button on the top right of the cell when you hover over it.', '', 'If you have edit or comment permissions you can comment on a cell in one of three ways:', '', '1. Select a cell and click the comment button in the toolbar above the top-right corner of the cell.', '1. Right click a text cell and select **Add a comment** from the context menu.', '3. Use the shortcut **Ctrl+Shift+M** to add a comment to the currently selected cell.', '', 'You can resolve and reply to comments, and you can target comments to specific collaborators by typing *+[email address]* (e.g., `+user@domain.com`). 
Addressed collaborators will be emailed.', '', 'The Comment button in the top-right corner of the page shows all comments attached to the notebook.']'" ] - assert len(dp.cells_df) == 22 + assert len(dp.cells_df) == len(dp.cells) assert isinstance(dp.cells_df, DataFrame) - assert dp.cells_df.columns.tolist() == ['filename', 'cell_id', 'cell_type', 'cell_length', 'page_content'] + assert sorted(dp.cells_df.columns.tolist()) == ['cell_id', 'cell_length', 'cell_type', 'file_id', 'filename', 'is_empty', 'page_content'] # CHUNKS - assert dp.chunks_df.columns.tolist() == ['filename', 'cell_id', 'cell_type', 'cell_length', 'page_content', 'chunk_id', 'chunk_length'] + assert len(dp.chunks) == 23 # more than the number of cells + assert len(dp.chunks_df) == len(dp.chunks) + assert sorted(dp.chunks_df.columns.tolist()) == ['cell_id', 'cell_length', 'cell_type', 'chunk_id', 'chunk_length', 'file_id', 'filename', 'is_empty','page_content'] diff --git a/test/results/cells.csv b/test/results/cells.csv new file mode 100644 index 0000000..8229b74 --- /dev/null +++ b/test/results/cells.csv @@ -0,0 +1,34 @@ +file_id,filename,cell_id,cell_length,cell_type,is_empty,page_content,dup_content,starter_cell_id,starter_content +the,Making_the_Most_of_your_Colab_Subscription.ipynb,1,71,TEXT,False,"'markdown' cell: '['# Making the Most of your Colab Subscription', '']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,2,798,TEXT,False,"'markdown' cell: '['## Faster GPUs', '', ""Users who have purchased one of Colab's paid plans have access to premium GPUs. You can upgrade your notebook's GPU settings in `Runtime > Change runtime type` in the menu to enable Premium accelerator. 
Subject to availability, selecting a premium GPU may grant you access to a V100 or A100 Nvidia GPU."", '', ""The free of charge version of Colab grants access to Nvidia's T4 GPUs subject to quota restrictions and availability."", '', 'You can see what GPU you\'ve been assigned at any time by executing the following cell. If the execution result of running the code cell below is ""Not connected to a GPU"", you can change the runtime by going to `Runtime > Change runtime type` in the menu to enable a GPU accelerator, and then re-execute the code cell.']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,3,250,CODE,False,"'code' cell: '['gpu_info = !nvidia-smi', ""gpu_info = '\\n'.join(gpu_info)"", ""if gpu_info.find('failed') >= 0:"", "" print('Not connected to a GPU')"", 'else:', ' print(gpu_info)']' with output: '['/bin/bash: line 1: nvidia-smi: command not found\n']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,4,168,TEXT,False,"'markdown' cell: '['In order to use a GPU with your notebook, select the `Runtime > Change runtime type` menu, and then set the hardware accelerator dropdown to GPU.']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,5,524,TEXT,False,"'markdown' cell: '['## More memory', '', ""Users who have purchased one of Colab's paid plans have access to high-memory VMs when they are available."", '', '', '', 'You can see how much memory you have available at any time by running the following code cell. If the execution result of running the code cell below is ""Not using a high-RAM runtime"", then you can enable a high-RAM runtime via `Runtime > Change runtime type` in the menu. Then select High-RAM in the Runtime shape dropdown. 
After, re-execute the code cell.']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,6,408,CODE,False,"'code' cell: '['from psutil import virtual_memory', 'ram_gb = virtual_memory().total / 1e9', ""print('Your runtime has {:.1f} gigabytes of available RAM\\n'.format(ram_gb))"", '', 'if ram_gb < 20:', "" print('Not using a high-RAM runtime')"", 'else:', "" print('You are using a high-RAM runtime!')""]' with output: '['Your runtime has 13.6 gigabytes of available RAM\n', '\n', 'Not using a high-RAM runtime\n']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,7,516,TEXT,False,"'markdown' cell: '['## Longer runtimes', '', ""All Colab runtimes are reset after some period of time (which is faster if the runtime isn't executing code). Colab Pro and Pro+ users have access to longer runtimes than those who use Colab free of charge."", '', '## Background execution', '', ""Colab Pro+ users have access to background execution, where notebooks will continue executing even after you've closed a browser tab. This is always enabled in Pro+ runtimes as long as you have compute units available."", '']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,8,521,TEXT,False,"'markdown' cell: '['## Relaxing resource limits in Colab Pro', '', ""Your resources are not unlimited in Colab. To make the most of Colab, avoid using resources when you don't need them. For example, only use a GPU when required and close Colab tabs when finished."", '', '', '', 'If you encounter limitations, you can relax those limitations by purchasing more compute units via Pay As You Go. Anyone can purchase compute units via [Pay As You Go](https://colab.research.google.com/signup); no subscription is required.']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,9,438,TEXT,False,"'markdown' cell: '['## Send us feedback!', '', ""If you have any feedback for us, please let us know. 
The best way to send feedback is by using the Help > 'Send feedback...' menu. If you encounter usage limits in Colab Pro consider subscribing to Pro+."", '', 'If you encounter errors or other issues with billing (payments) for Colab Pro, Pro+, or Pay As You Go, please email [colab-billing@google.com](mailto:colab-billing@google.com).']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,10,1600,TEXT,False,"'markdown' cell: '['## More Resources', '', '### Working with Notebooks in Colab', '- [Overview of Colaboratory](/notebooks/basic_features_overview.ipynb)', '- [Guide to Markdown](/notebooks/markdown_guide.ipynb)', '- [Importing libraries and installing dependencies](/notebooks/snippets/importing_libraries.ipynb)', '- [Saving and loading notebooks in GitHub](https://colab.research.google.com/github/googlecolab/colabtools/blob/main/notebooks/colab-github-demo.ipynb)', '- [Interactive forms](/notebooks/forms.ipynb)', '- [Interactive widgets](/notebooks/widgets.ipynb)', '', '', '### Working with Data', '- [Loading data: Drive, Sheets, and Google Cloud Storage](/notebooks/io.ipynb)', '- [Charts: visualizing data](/notebooks/charts.ipynb)', '- [Getting started with BigQuery](/notebooks/bigquery.ipynb)', '', '### Machine Learning Crash Course', ""These are a few of the notebooks from Google's online Machine Learning course. 
See the [full course website](https://developers.google.com/machine-learning/crash-course/) for more."", '- [Intro to Pandas DataFrame](https://colab.research.google.com/github/google/eng-edu/blob/main/ml/cc/exercises/pandas_dataframe_ultraquick_tutorial.ipynb)', '- [Linear regression with tf.keras using synthetic data](https://colab.research.google.com/github/google/eng-edu/blob/main/ml/cc/exercises/linear_regression_with_synthetic_data.ipynb)', '', '', '', '### Using Accelerated Hardware', '- [TensorFlow with GPUs](/notebooks/gpu.ipynb)', '- [TensorFlow with TPUs](/notebooks/tpu.ipynb)']'",False,,False +the,Making_the_Most_of_your_Colab_Subscription.ipynb,11,1171,TEXT,False,"'markdown' cell: '['', '', '## Machine Learning Examples', '', 'To see end-to-end examples of the interactive machine learning analyses that Colaboratory makes possible, check out these tutorials using models from [TensorFlow Hub](https://tfhub.dev).', '', 'A few featured examples:', '', '- [Retraining an Image Classifier](https://tensorflow.org/hub/tutorials/tf2_image_retraining): Build a Keras model on top of a pre-trained image classifier to distinguish flowers.', '- [Text Classification](https://tensorflow.org/hub/tutorials/tf2_text_classification): Classify IMDB movie reviews as either *positive* or *negative*.', '- [Style Transfer](https://tensorflow.org/hub/tutorials/tf2_arbitrary_image_stylization): Use deep learning to transfer style between images.', '- [Multilingual Universal Sentence Encoder Q&A](https://tensorflow.org/hub/tutorials/retrieval_with_tf_hub_universal_encoder_qa): Use a machine learning model to answer questions from the SQuAD dataset.', '- [Video Interpolation](https://tensorflow.org/hub/tutorials/tweening_conv3d): Predict what happened in a video between the first and the last frame.']'",False,,False +of,Overview_of_Colaboratory_Features.ipynb,1,164,TEXT,False,"'markdown' cell: '['# Cells', 'A notebook is a list of cells. 
Cells contain either explanatory text or executable code and its output. Click a cell to select it.']'",False,1.0,True +of,Overview_of_Colaboratory_Features.ipynb,2,595,TEXT,False,"'markdown' cell: '['## Code cells', 'Below is a **code cell**. Once the toolbar button indicates CONNECTED, click in the cell to select it and execute the contents in the following ways:', '', '* Click the **Play icon** in the left gutter of the cell;', '* Type **Cmd/Ctrl+Enter** to run the cell in place;', '* Type **Shift+Enter** to run the cell and move focus to the next cell (adding one if none exists); or', '* Type **Alt+Enter** to run the cell and insert a new code cell immediately below it.', '', 'There are additional options for running some or all cells in the **Runtime** menu.']'",False,2.0,True +of,Overview_of_Colaboratory_Features.ipynb,3,497,TEXT,False,"'markdown' cell: '['## Text cells', 'This is a **text cell**. You can **double-click** to edit this cell. Text cells', 'use markdown syntax. To learn more, see our [markdown', 'guide](/notebooks/markdown_guide.ipynb).', '', 'You can also add math to text cells using [LaTeX](http://www.latex-project.org/)', 'to be rendered by [MathJax](https://www.mathjax.org). Just place the statement', 'within a pair of **\\$** signs. For example `$\\sqrt{3x-1}+(1+x)^2$` becomes', '$\\sqrt{3x-1}+(1+x)^2.$']'",False,3.0,True +of,Overview_of_Colaboratory_Features.ipynb,4,706,TEXT,False,"'markdown' cell: '['## Adding and moving cells', 'You can add new cells by using the **+ CODE** and **+ TEXT** buttons that show when you hover between cells. These buttons are also in the toolbar above the notebook where they can be used to add a cell below the currently selected cell.', '', 'You can move a cell by selecting it and clicking **Cell Up** or **Cell Down** in the top toolbar.', '', 'Consecutive cells can be selected by ""lasso selection"" by dragging from outside one cell and through the group. 
Non-adjacent cells can be selected concurrently by clicking one and then holding down Ctrl while clicking another. Similarly, using Shift instead of Ctrl will select all intermediate cells.']'",False,4.0,True +of,Overview_of_Colaboratory_Features.ipynb,5,180,TEXT,False,"'markdown' cell: '['# Working with python', 'Colaboratory is built on top of [Jupyter Notebook](https://jupyter.org/). Below are some examples of convenience functions provided.']'",False,5.0,True +of,Overview_of_Colaboratory_Features.ipynb,6,185,TEXT,False,'markdown' cell: '['Long running python processes can be interrupted. Run the following cell and select **Runtime -> Interrupt execution** (*hotkey: Cmd/Ctrl-M I*) to stop execution.']',False,6.0,True +of,Overview_of_Colaboratory_Features.ipynb,7,161,CODE,False,"'code' cell: '['import time', 'print(""Sleeping"")', 'time.sleep(30) # sleep for a while; interrupt me!', 'print(""Done Sleeping"")']' with output: '['Sleeping\n']'",False,7.0,True +of,Overview_of_Colaboratory_Features.ipynb,8,109,TEXT,False,"'markdown' cell: '['## System aliases', '', 'Jupyter includes shortcuts for common operations, such as ls:']'",False,8.0,True +of,Overview_of_Colaboratory_Features.ipynb,9,369,CODE,False,"'code' cell: '['!ls /bin']' with output: '[""'['\t\t\t\t mknod\n"", ' 7z\t\t\t\t mktemp\n', ' 7za\t\t\t\t mm2gv\n', ' 7zr\t\t\t\t more\n', ' aclocal\t\t\t mount\n', ' aclocal-1.16\t\t\t mountpoint\n', ' acyclic\t\t\t mpexpand\n', ' add-apt-repository\t\t mpic++\n', ' addpart\t\t\t mpicc\n', ' addr2line\t\t\t mpiCC\n']'",False,9.0,True +of,Overview_of_Colaboratory_Features.ipynb,10,459,TEXT,False,"'markdown' cell: '['That `!ls` probably generated a large output. You can select the cell and clear the output by either:', '', '1. Clicking on the clear output button (x) in the toolbar above the cell; or', '2. 
Right clicking the left gutter of the output area and selecting ""Clear output"" from the context menu.', '', 'Execute any other process using `!` with string interpolation from python variables, and note the result can be assigned to a variable:']'",False,10.0,True +of,Overview_of_Colaboratory_Features.ipynb,11,309,TEXT,False,"'markdown' cell: '['## Magics', ""Colaboratory shares the notion of magics from Jupyter. There are shorthand annotations that change how a cell's text is executed. To learn more, see [Jupyter's magics page](http://nbviewer.jupyter.org/github/ipython/ipython/blob/1.x/examples/notebooks/Cell%20Magics.ipynb).""]'",False,11.0,True +of,Overview_of_Colaboratory_Features.ipynb,12,300,TEXT,False,"'markdown' cell: '['## Automatic completions and exploring code', '', 'Colab provides automatic completions to explore attributes of Python objects, as well as to quickly view documentation strings. As an example, first run the following cell to import the [`numpy`](http://www.numpy.org) module.']'",False,12.0,True +of,Overview_of_Colaboratory_Features.ipynb,13,37,CODE,False,'code' cell: '['import numpy as np']',False,13.0,True +of,Overview_of_Colaboratory_Features.ipynb,14,218,TEXT,False,"'markdown' cell: '['If you now insert your cursor after `np` and press **Period**(`.`), you will see the list of available completions within the `np` module. 
Completions can be opened again by using **Ctrl+Space**.']'",False,14.0,True +of,Overview_of_Colaboratory_Features.ipynb,15,148,TEXT,False,"'markdown' cell: '['If you type an open parenthesis after any function or class in the module, you will see a pop-up of its documentation string:']'",False,15.0,True +of,Overview_of_Colaboratory_Features.ipynb,16,359,TEXT,False,"'markdown' cell: '['The documentation can be opened again using **Ctrl+Shift+Space** or you can view the documentation for method by mouse hovering over the method name.', '', 'When hovering over the method name the `Open in tab` link will open the documentation in a persistent pane. The `View source` link will navigate to the source code for the method.']'",False,16.0,True +of,Overview_of_Colaboratory_Features.ipynb,17,46,TEXT,False,'markdown' cell: '['## Exception Formatting']',False,17.0,True +of,Overview_of_Colaboratory_Features.ipynb,18,72,TEXT,False,'markdown' cell: '['Exceptions are formatted nicely in Colab outputs:']',False,18.0,True +of,Overview_of_Colaboratory_Features.ipynb,19,109,CODE,False,"'code' cell: '['x = 1', 'y = 4', 'z = y/(1-x)']' , gives error 'ZeroDivisionError',with description 'ignored'",False,19.0,True +of,Overview_of_Colaboratory_Features.ipynb,20,165,TEXT,False,"'markdown' cell: '['## Rich, interactive outputs', 'Until now all of the generated outputs have been text, but they can be more interesting, like the chart below.']'",False,20.0,True +of,Overview_of_Colaboratory_Features.ipynb,21,613,TEXT,False,"'markdown' cell: '['# Integration with Drive', '', 'Colaboratory is integrated with Google Drive. It allows you to share, comment, and collaborate on the same document with multiple people:', '', '* The **SHARE** button (top-right of the toolbar) allows you to share the notebook and control permissions set on it.', '', '* **File->Make a Copy** creates a copy of the notebook in Drive.', '', ""* **File->Save** saves the File to Drive. 
**File->Save and checkpoint** pins the version so it doesn't get deleted from the revision history."", '', ""* **File->Revision history** shows the notebook's revision history.""]'",False,21.0,True +of,Overview_of_Colaboratory_Features.ipynb,22,1015,TEXT,False,"'markdown' cell: '['## Commenting on a cell', 'You can comment on a Colaboratory notebook like you would on a Google Document. Comments are attached to cells, and are displayed next to the cell they refer to. If you have **comment-only** permissions, you will see a comment button on the top right of the cell when you hover over it.', '', 'If you have edit or comment permissions you can comment on a cell in one of three ways:', '', '1. Select a cell and click the comment button in the toolbar above the top-right corner of the cell.', '1. Right click a text cell and select **Add a comment** from the context menu.', '3. Use the shortcut **Ctrl+Shift+M** to add a comment to the currently selected cell.', '', 'You can resolve and reply to comments, and you can target comments to specific collaborators by typing *+[email address]* (e.g., `+user@domain.com`). 
Addressed collaborators will be emailed.', '', 'The Comment button in the top-right corner of the page shows all comments attached to the notebook.']'",False,22.0,True diff --git a/test/results/notebooks.csv b/test/results/notebooks.csv new file mode 100644 index 0000000..da77fd6 --- /dev/null +++ b/test/results/notebooks.csv @@ -0,0 +1,3 @@ +filename,file_id,length,cells,code_cells,text_cells,avg_code_cell_length,avg_text_cell_length +Making_the_Most_of_your_Colab_Subscription.ipynb,the,6497,11,2,9,329.0,645.2 +Overview_of_Colaboratory_Features.ipynb,of,6888,22,4,18,169.0,341.1 diff --git a/test/submissions_manager_test.py b/test/submissions_manager_test.py new file mode 100644 index 0000000..1702ea2 --- /dev/null +++ b/test/submissions_manager_test.py @@ -0,0 +1,27 @@ + +from app.submissions_manager import SubmissionsManager + +from conftest import TEST_DOCS_DIRPATH + +def test_submissions_manager(): + + # WITHOUT STARTER FILE: + + sm = SubmissionsManager(dirpath=TEST_DOCS_DIRPATH, starter_filename=None) + assert sm.dirpath == TEST_DOCS_DIRPATH + assert len(sm.filenames) == 2 + assert len(sm.filepaths) == 2 + assert sm.starter_filepath == None + + assert sm.find_filepath("OOPS") == None + #assert sm.find_filepath("Subscription") == "test/documents/Making_the_Most_of_your_Colab_Subscription.ipynb" + assert "test/documents/Making_the_Most_of_your_Colab_Subscription.ipynb" in sm.find_filepath("Subscription") + + # WITH STARTER FILE: + + sm = SubmissionsManager(dirpath=TEST_DOCS_DIRPATH, starter_filename="Overview_of_Colaboratory_Features.ipynb") + assert sm.dirpath == TEST_DOCS_DIRPATH + assert len(sm.filenames) == 2 + assert len(sm.filepaths) == 2 + #assert sm.starter_filepath == "test/documents/Overview_of_Colaboratory_Features.ipynb" + assert "test/documents/Overview_of_Colaboratory_Features.ipynb" in sm.starter_filepath diff --git a/test/submissions_processor_test.py b/test/submissions_processor_test.py new file mode 100644 index 0000000..f8cc7ed --- /dev/null 
+++ b/test/submissions_processor_test.py @@ -0,0 +1,69 @@ + +from pandas import DataFrame + + +from app.submissions_processor import SubmissionsProcessor + +from conftest import TEST_DOCS_DIRPATH, TEST_RESULTS_DIRPATH + + +EXPECTED_NOTEBOOK_RECORDS = [ + { + 'filename': 'Making_the_Most_of_your_Colab_Subscription.ipynb', + 'file_id': 'the', # default id not the best for the test files. it's ok. todo: revisit + 'length': 6497, + 'cells': 11, + 'code_cells': 2, + 'text_cells': 9, + 'avg_code_cell_length': 329.0, + 'avg_text_cell_length': 645.2, + }, + { + 'filename': 'Overview_of_Colaboratory_Features.ipynb', + 'file_id': 'of', # default id not the best for the test files. it's ok. todo: revisit + 'length': 6888, + 'cells': 22, + 'code_cells': 4, + 'text_cells': 18, + 'avg_code_cell_length': 169.0, + 'avg_text_cell_length': 341.1, + + } +] + +def test_submissions_processor(): + + sp = SubmissionsProcessor(dirpath=TEST_DOCS_DIRPATH, results_dirpath=TEST_RESULTS_DIRPATH, starter_filename="Overview_of_Colaboratory_Features.ipynb") + sp.perform() + + assert isinstance(sp.notebooks_df, DataFrame) + assert sp.notebooks_df.to_dict("records") == EXPECTED_NOTEBOOK_RECORDS + + assert isinstance(sp.cells_df, DataFrame) + assert len(sp.cells_df) == 33 + assert sp.cells_df.columns.tolist() == ['file_id', 'filename', 'cell_id', 'cell_length', 'cell_type', 'is_empty', 'page_content', 'dup_content', 'starter_cell_id', 'starter_content'] + assert sp.cells_df["is_empty"].sum() == 0 # there are no blank cells in the test notebooks + assert sp.cells_df["dup_content"].sum() == 0 # there are no overlapping cells in the test notebooks + assert sp.cells_df["starter_content"].sum() == 22 + + starter_cells = sp.cells_df[sp.cells_df["filename"] == sp.starter_filename] + other_cells = sp.cells_df[sp.cells_df["filename"] != sp.starter_filename] + assert len(starter_cells) == 22 + assert len(other_cells) == 11 + + + +def test_submissions_processor_without_starter(): + + sp = 
SubmissionsProcessor(dirpath=TEST_DOCS_DIRPATH, results_dirpath=TEST_RESULTS_DIRPATH, starter_filename=None) + sp.perform() + + assert isinstance(sp.notebooks_df, DataFrame) + assert sp.notebooks_df.to_dict("records") == EXPECTED_NOTEBOOK_RECORDS + + assert isinstance(sp.cells_df, DataFrame) + assert len(sp.cells_df) == 33 + assert sp.cells_df.columns.tolist() == ['file_id', 'filename', 'cell_id', 'cell_length', 'cell_type', 'is_empty', 'page_content', 'dup_content', 'starter_cell_id', 'starter_content'] + assert sp.cells_df["is_empty"].sum() == 0 # there are no blank cells in the test notebooks + assert sp.cells_df["dup_content"].sum() == 0 # there are no overlapping cells in the test notebooks + assert sp.cells_df["starter_content"].sum() == 0 # no starter if we don't want it