Commit 9b57145
gleasonw committed Oct 17, 2024
1 parent 1eb1456 commit 9b57145
Showing 14 changed files with 298 additions and 36 deletions.
8 changes: 4 additions & 4 deletions app/gallicagram.py
@@ -77,13 +77,13 @@ async def do_dataframe_fetch(url: str, params: Dict):
def transform_series(series_dataframe: pd.DataFrame, input: GallicagramInput):
if input.grouping == "mois" and input.source != "livres":
series_dataframe = (
series_dataframe.groupby(["annee", "mois", "gram"])
series_dataframe.groupby(["annee", "mois"])
.agg({"n": "sum", "total": "sum"})
.reset_index()
)
if input.grouping == "annee":
series_dataframe = (
series_dataframe.groupby(["annee", "gram"])
series_dataframe.groupby(["annee"])
.agg({"n": "sum", "total": "sum"})
.reset_index()
)
@@ -99,12 +99,12 @@ def calc_ratio(row):
if all(series_dataframe.ratio == 0):
raise HTTPException(status_code=404, detail="No occurrences of the term found")

-    def get_unix_timestamp(row) -> int:
+    def get_unix_timestamp(row) -> float:
year = int(row.get("annee", 0))
month = int(row.get("mois", 1))

dt = datetime(year, month, 1)
-        return int(dt.timestamp() * 1000)
+        return dt.timestamp() * 1000

return series_dataframe.apply(
lambda row: (get_unix_timestamp(row), row["ratio"]), axis=1
71 changes: 62 additions & 9 deletions app/main.py
@@ -1,12 +1,10 @@
import asyncio
from contextlib import asynccontextmanager
from io import StringIO
import os
import time
import aiohttp.client_exceptions
from bs4 import BeautifulSoup, ResultSet
import uvicorn
-from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, TypedDict
from app.context import Context, HTMLContext
from app.contextSnippets import (
ContextSnippetQuery,
@@ -38,7 +36,6 @@
from app.mostFrequent import get_sample_text
from app.volumeOccurrence import VolumeOccurrence, VolumeRecord
from pydantic import BaseModel
-import pandas as pd
from datetime import datetime
import logfire

@@ -423,6 +420,59 @@ async def sample(
return sample_text.read()


@app.get("/api/occurrences_no_context")
async def fetch_occurrences_no_context(
args: ContextSearchArgs = Depends(sru_params),
session: aiohttp.ClientSession = Depends(session),
) -> SRUResponse:
total_records = 0
origin_urls = []

def set_total_records(num_records: int):
nonlocal total_records
total_records = num_records

def set_origin_urls(urls: List[str]):
nonlocal origin_urls
origin_urls = urls

records = await get_documents_with_occurrences(
args=args,
on_get_total_records=set_total_records,
on_get_origin_urls=set_origin_urls,
session=session,
)

return SRUResponse(
records=records,
total_records=total_records,
origin_urls=origin_urls,
)


@app.get("/api/context")
async def context(
ark: str = Query(),
terms: List[str] = Query(),
url: str = Query(),
session: aiohttp.ClientSession = Depends(session),
) -> List[ContextRow]:
context = Context.get(
queries=[ContentQuery(ark=ark, terms=terms)],
session=session,
)
context_unwrapped = [record async for record in context]
result = context_unwrapped[0]
if result is None:
raise HTTPException(status_code=404, detail="No results found")
return [
row
for row in build_row_record_from_ContentSearch_response(
record={"url": url, "terms": terms}, context=result
)
]


@app.get("/api/gallicaRecords")
async def fetch_records_from_gallica(
args: ContextSearchArgs = Depends(sru_params),
@@ -615,19 +665,22 @@ def stringify_and_split(span: BeautifulSoup):
return rows


-def build_row_record_from_ContentSearch_response(
-    record: VolumeRecord, context: HTMLContext
-):
+class Record(TypedDict):
+    url: str
+    terms: List[str]
+
+
+def build_row_record_from_ContentSearch_response(record: Record, context: HTMLContext):
for page in context.pages:
soup = BeautifulSoup(page.context, "html.parser")
spans = soup.find_all("span", {"class": "highlight"})
if spans:
page_rows = parse_spans_to_rows(
spans=spans,
-                terms=record.terms,
+                terms=record["terms"],
)
for row in page_rows:
row.page_url = f"{record.url}/f{page.page_num}.image.r={row.pivot}"
row.page_url = f"{record['url']}/f{page.page_num}.image.r={row.pivot}"
row.page_num = page.page_num
yield row

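
A hedged usage sketch of the two new endpoints (assumes a local server on port 8000 and the requests library; field names follow SRUResponse and VolumeRecord as defined in this commit):

import requests

base = "http://localhost:8000"

# New endpoint: occurrence metadata only, no context snippets.
occurrences = requests.get(
    f"{base}/api/occurrences_no_context", params={"terms": "brazza"}
).json()

# New endpoint: highlighted context rows for one document, keyed by ark.
first = occurrences["records"][0]
rows = requests.get(
    f"{base}/api/context",
    params={"ark": first["ark"], "terms": "brazza", "url": first["url"]},
).json()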
21 changes: 20 additions & 1 deletion app/utils/parse_xml.py
@@ -2,7 +2,12 @@
from typing import List, Tuple
import re

-# TODO: refactor as a dataclass
+namespaces = {
+    "srw": "http://www.loc.gov/zing/srw/",
+    "oai_dc": "http://www.openarchives.org/OAI/2.0/oai_dc/",
+    "dc": "http://purl.org/dc/elements/1.1/",
+    # Add any other namespaces you might need
+}


def get_one_paper_from_record_batch(xml: bytes) -> str:
@@ -107,6 +112,20 @@ def get_url_from_record(record) -> str:
return ""


def get_uri_from_record_xml(record) -> str:
try:
extra_record_element = record.find("srw:extraRecordData", namespaces=namespaces)

uri = extra_record_element.findtext("uri")

if uri is not None:
return uri
return ""
except etree.XMLSyntaxError as e:
print(f"XML Syntax Error: {e}")
return ""


def get_paper_title_from_record_xml(record) -> str:
xml = get_data_from_record_root(record)
paper_title = xml.find("{http://purl.org/dc/elements/1.1/}title").text
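
A small sketch of the new namespaces mapping in use; the XML is hypothetical but mirrors the srw:extraRecordData lookup in get_uri_from_record_xml:

from lxml import etree

xml = b"""
<srw:record xmlns:srw="http://www.loc.gov/zing/srw/">
  <srw:extraRecordData>
    <uri>ark:/12148/bpt6k0000000</uri>
  </srw:extraRecordData>
</srw:record>
"""

record = etree.fromstring(xml)
extra = record.find("srw:extraRecordData", namespaces={"srw": "http://www.loc.gov/zing/srw/"})
print(extra.findtext("uri"))  # ark:/12148/bpt6k0000000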
23 changes: 5 additions & 18 deletions app/volumeOccurrence.py
@@ -2,6 +2,7 @@
import urllib.parse

import aiohttp
+from pydantic import BaseModel
from app.fetch import fetch_queries_concurrently
from app.queries import VolumeQuery

@@ -17,6 +18,7 @@
get_paper_title_from_record_xml,
get_paper_code_from_record_xml,
get_date_from_record_xml,
+    get_uri_from_record_xml,
get_url_from_record,
get_num_records_from_gallica_xml,
get_publisher_from_record_xml,
@@ -27,33 +29,17 @@
from app.models import OccurrenceArgs


-@dataclass(frozen=True, slots=True)
-class VolumeRecord:
+class VolumeRecord(BaseModel):
paper_title: str
paper_code: str
ocr_quality: float
author: str
url: str
date: str
+    ark: str
terms: List[str]
publisher: Optional[str] = None

-    @property
-    def ark(self) -> str:
-        return self.url.split("/")[-1]
-
-    def dict(self):
-        return {
-            "paper_title": self.paper_title,
-            "paper_code": self.paper_code,
-            "ocr_quality": self.ocr_quality,
-            "author": self.author,
-            "url": self.url,
-            "date": str(self.date),
-            "terms": self.terms,
-            "ark": self.ark,
-        }


class VolumeOccurrence:
"""Fetches occurrence metadata from Gallica's SRU API. There may be many occurrences in one Gallica record."""
@@ -133,6 +119,7 @@ def parse(gallica_responses: Any, on_get_total_records=None):
paper_title=get_paper_title_from_record_xml(record),
paper_code=get_paper_code_from_record_xml(record),
date=get_date_from_record_xml(record),
+                        ark=get_uri_from_record_xml(record),
url=get_url_from_record(record),
author=get_author_from_record_xml(record),
publisher=get_publisher_from_record_xml(record),
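
Moving VolumeRecord from a frozen dataclass to a pydantic BaseModel drops the hand-written dict() and the derived ark property; ark is now parsed directly from srw:extraRecordData. A sketch with hypothetical field values:

record = VolumeRecord(
    paper_title="Le Petit Journal",
    paper_code="cb32895690j",
    ocr_quality=0.9,
    author="",
    url="https://gallica.bnf.fr/ark:/12148/bpt6k0000000",
    date="1900-01-01",
    ark="ark:/12148/bpt6k0000000",
    terms=["brazza"],
)
print(record.dict())  # pydantic v1; on v2 this is record.model_dump()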
2 changes: 1 addition & 1 deletion gallica-getter-bruno/collection.bru
@@ -1,3 +1,3 @@
vars:pre-request {
-  baseUrl: https://gallica-getter-little-snow-3158.fly.dev
+  baseUrl: http://localhost:8000
}
4 changes: 2 additions & 2 deletions gallica-getter-bruno/direct-gallica.bru
@@ -5,7 +5,7 @@ meta {
}

get {
-  url: https://gallica.bnf.fr/SRU?operation=searchRetrieve&exactSearch=True&version=1.2&startRecord=0&maximumRecords=10&collapsing=false&query=%28text+adj+%22brazza%22%29+and+dc.type+all+%22fascicule%22+or+dc.type+all+%22monographie%22
+  url: https://gallica.bnf.fr/SRU?operation=searchRetrieve&exactSearch=True&version=1.2&startRecord=0&maximumRecords=10&collapsing=false&query=(text adj "los angeles") and dc.type all "fascicule" or dc.type all "monographie"
body: none
auth: none
}
@@ -17,5 +17,5 @@ params:query {
startRecord: 0
maximumRecords: 10
collapsing: false
query: (text adj "brazza") and dc.type all "fascicule" or dc.type all "monographie"
query: (text adj "los angeles") and dc.type all "fascicule" or dc.type all "monographie"
}
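
The rewritten url and the params:query block express the same request; the old percent-encoded query decodes to the plain form with the standard library:

from urllib.parse import unquote_plus

old = "%28text+adj+%22brazza%22%29+and+dc.type+all+%22fascicule%22+or+dc.type+all+%22monographie%22"
print(unquote_plus(old))
# (text adj "brazza") and dc.type all "fascicule" or dc.type all "monographie"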
5 changes: 4 additions & 1 deletion gallica-getter-bruno/gallicaRecords.bru
@@ -5,11 +5,14 @@ meta {
}

get {
-  url: {{baseUrl}}/api/gallicaRecords?terms=brazza
+  url: {{baseUrl}}/api/gallicaRecords?terms=brazza&start_date=1789&end_date=1945&limit=10
body: none
auth: none
}

params:query {
terms: brazza
+  start_date: 1789
+  end_date: 1945
+  limit: 10
}
11 changes: 11 additions & 0 deletions gallica-getter-bruno/golang server xml test.bru
@@ -0,0 +1,11 @@
meta {
name: golang server xml test
type: http
seq: 6
}

get {
url: http://localhost:8888/testfetch
body: none
auth: none
}
15 changes: 15 additions & 0 deletions gallica-getter-bruno/occurrences_no_context.bru
@@ -0,0 +1,15 @@
meta {
name: occurrences_no_context
type: http
seq: 8
}

get {
  url: {{baseUrl}}/api/occurrences_no_context?terms=brazza
body: none
auth: none
}

params:query {
terms: brazza
}
15 changes: 15 additions & 0 deletions gallica-getter-bruno/series.bru
@@ -0,0 +1,15 @@
meta {
name: series
type: http
seq: 7
}

get {
url: {{baseUrl}}/api/series?term=brazza
body: none
auth: none
}

params:query {
term: brazza
}