From 955ceccdce65b147189c9d1421d02fd0bf6577e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Odini?= Date: Thu, 12 Sep 2024 14:34:01 +0200 Subject: [PATCH] feat(GDPR): improve script to manage Picard (#430) --- scripts/gdpr/README.md | 38 ++++++++---- scripts/gdpr/create_prices_from_gdpr_csv.py | 67 ++++++++++++-------- scripts/gdpr/gdpr_field_mapping.csv | 16 ++--- scripts/gdpr/merge_two_csv_files.py | 44 ++++++++++--- scripts/gdpr/utils.py | 68 +++++++++++++++++++++ 5 files changed, 178 insertions(+), 55 deletions(-) create mode 100644 scripts/gdpr/utils.py diff --git a/scripts/gdpr/README.md b/scripts/gdpr/README.md index ff6744b9..d8de34f4 100644 --- a/scripts/gdpr/README.md +++ b/scripts/gdpr/README.md @@ -1,23 +1,26 @@ -# GDPR request data +# Uploading GDPR price data ## Context -One of our data sources is GDPR request to supermarkets. See https://wiki.openfoodfacts.org/GDPR_request +One of our data sources is GDPR request to supermarkets (with fidelity cards). + +See https://wiki.openfoodfacts.org/GDPR_request ## List of supermarkets -|Supermarket|Data|Preprocessing| -|-----------|---|---| -|Auchan |1 single file|| +|Supermarket|Data |Preprocessing| +|-----------|------------------|---| +|Auchan |1 single file || |Carrefour |1 file with 2 tabs|- merge files
- skip discounts| -|E.Leclerc |2 files|- merge files| -|Intermarché|1 single file|| +|E.Leclerc |2 files |- merge files| +|Intermarché|1 single file || +|Picard |1 file with multiple tables|- create separate files<br />
- merge files| ## Usage -### Step 1: get an API token +### Step 1: get your API token from Open Prices -https://prices.openfoodfacts.org/api/docs#/Auth/authentication_api_v1_auth_post +https://prices.openfoodfacts.org/api/docs#/auth/auth_create ### Step 2: upload a proof @@ -42,7 +45,7 @@ Depending on the source, you'll need to provide the correct `LOCATION` key, and Use the token returned in Step 1. ``` -FILEPATH=../data/Carrefour/Carte_Carrefour_NAME_merged.csv SOURCE=CARREFOUR LOCATION="City Jaures Grenoble" LOCATION_OSM_ID=1697821864 LOCATION_OSM_TYPE=NODE PROOF_ID=1234 API_ENDPOINT=https://prices.openfoodfacts.net/api/v1 API_TOKEN=username_token-hash poetry run python data/gdpr/create_prices_from_gdpr_csv.py +FILEPATH=../data/Carrefour/Carte_Carrefour_NAME_merged.csv SOURCE=CARREFOUR LOCATION="City Jaures Grenoble" LOCATION_OSM_ID=1697821864 LOCATION_OSM_TYPE=NODE PROOF_ID=1234 API_ENDPOINT=https://prices.openfoodfacts.net/api/v1 API_TOKEN=username_token-hash poetry run python scripts/gdpr/create_prices_from_gdpr_csv.py ``` Last changes when you're ready: @@ -55,12 +58,25 @@ Last changes when you're ready: Script name: `merge_two_csv_files.csv` +Goal: merge and enrich data from the second csv file into the first csv file. + +#### E.Leclerc + E.Leclerc returns 2 different files, one containing a list of receipts (with dates & locations), and the other a list of products with their receipt id. So we need to first merge the 2 files into 1. ``` (TODO) ``` +#### Carrefour + For Carrefour, the file contains 2 tabs, 1 called "Tickets" and the other called "Remise". 
``` -FILEPATH_1=Carte_Carrefour_NAME_liste_tickets_Tickets.csv FILEPATH_2=Carte_Carrefour_NAME_liste_tickets_Remises.csv PIVOT_FIELD_NAME="Numéro du ticket de caisse magasin,Code Barre du produit,Description du produit" poetry run python data/gdpr/merge_two_csv_files.py +FILEPATH_1=Carte_Carrefour_NAME_liste_tickets_Tickets.csv FILEPATH_2=Carte_Carrefour_NAME_liste_tickets_Remises.csv PIVOT_FIELD_NAME_LIST="Numéro du ticket de caisse magasin,Code Barre du produit,Description du produit" poetry run python scripts/gdpr/merge_two_csv_files.py +``` + +#### Picard + +Picard returns 1 spreadsheet containing multiple tables. We first need to store the Product table & the Tickets table in 2 seperate csv files. +``` +FILEPATH_1=Picard_Produits.csv FILEPATH_2=Picard_Tickets.csv PIVOT_FIELD_NAME_LIST="NUMERO DE TICKET" EXCLUDE_FIELD_NAME_LIST="PRIX TTC" poetry run python scripts/gdpr/merge_two_csv_files.py ``` diff --git a/scripts/gdpr/create_prices_from_gdpr_csv.py b/scripts/gdpr/create_prices_from_gdpr_csv.py index 5b762e39..5a63abd6 100644 --- a/scripts/gdpr/create_prices_from_gdpr_csv.py +++ b/scripts/gdpr/create_prices_from_gdpr_csv.py @@ -5,10 +5,12 @@ import time import requests +from utils import get_picard_product_from_subcode OPEN_PRICES_CREATE_PRICE_ENDPOINT = f'{os.environ.get("API_ENDPOINT")}/prices' OPEN_PRICES_TOKEN = os.environ.get("API_TOKEN") -GDPR_FIELD_MAPPING_FILEPATH = "data/gdpr/gdpr_field_mapping.csv" + +GDPR_FIELD_MAPPING_FILEPATH = "scripts/gdpr/gdpr_field_mapping.csv" DEFAULT_PRICE_CURRENCY = "EUR" PRICE_FIELDS = [ @@ -44,10 +46,14 @@ def gdpr_source_field_cleanup_rules(gdpr_source, op_field, gdpr_field_value): # remove any whitespace gdpr_field_value = gdpr_field_value.strip() - # shop specific rules - if gdpr_source == "AUCHAN": - if op_field == "price": + # field-specific rules + if op_field in ["price", "quantity"]: + if gdpr_field_value: gdpr_field_value = float(gdpr_field_value.replace(",", ".")) + + # shop-specific rules + if gdpr_source == 
"AUCHAN": + pass elif gdpr_source == "CARREFOUR": # input: |3178050000749| # output: 3178050000749 @@ -62,15 +68,18 @@ def gdpr_source_field_cleanup_rules(gdpr_source, op_field, gdpr_field_value): elif gdpr_source == "ELECLERC": pass elif gdpr_source == "INTERMARCHE": - if op_field in ["price", "quantity"]: - # divide price by quantity - gdpr_field_value = float(gdpr_field_value.replace(",", ".")) # input: 27/05/2021 # output: 2021-05-27 if op_field == "date": gdpr_field_value = datetime.datetime.strptime( gdpr_field_value, "%d/%m/%Y" ).strftime("%Y-%m-%d") + elif gdpr_source == "PICARD": + # Picard codes are a subset of the EAN codes + # They have a length of 5 (4 if missing leading 0) + if op_field == "product_code": + if len(gdpr_field_value) == 4: + gdpr_field_value = f"0{gdpr_field_value}" return gdpr_field_value @@ -79,15 +88,15 @@ def gdpr_source_price_cleanup_rules(gdpr_source, gdpr_op_price): """ Rules to cleanup the price object """ - if gdpr_source == "AUCHAN": - pass - elif gdpr_source == "CARREFOUR": - pass - elif gdpr_source == "ELECLERC": - pass - elif gdpr_source == "INTERMARCHE": - # price must be divided by quantity - gdpr_op_price["price"] = gdpr_op_price["price"] / gdpr_op_price["quantity"] + # price must be divided by quantity + if "quantity" in gdpr_op_price: + if gdpr_op_price["quantity"]: + gdpr_op_price["price"] = gdpr_op_price["price"] / gdpr_op_price["quantity"] + + # discount boolean flag + if "discount" in gdpr_op_price: + if gdpr_op_price["discount"]: + gdpr_op_price["price_is_discounted"] = True return gdpr_op_price @@ -135,6 +144,12 @@ def gdpr_source_filter_rules(op_price_list, gdpr_source=""): passes_test = False elif gdpr_source == "INTERMARCHE": pass + elif gdpr_source == "PICARD": + full_product_code = get_picard_product_from_subcode(op_price) + if full_product_code: + op_price["product_code"] = full_product_code + else: + passes_test = False if passes_test: op_price_list_filtered.append(op_price) @@ -219,7 +234,7 @@ def 
create_price(price): if __name__ == "__main__": """ How-to run: - > FILEPATH= poetry run python data/gdpr/create_prices_from_gdpr_csv.py + > FILEPATH= poetry run python scripts/gdpr/create_prices_from_gdpr_csv.py Required params: see REQUIRED_ENV_PARAMS """ # Step 1: read input file @@ -256,21 +271,21 @@ def create_price(price): ) print(len(open_prices_price_list)) - # Step 4a: filter prices depending on specific source rules - print("===== Applying source filtering rules") - open_prices_price_list_filtered_1 = gdpr_source_filter_rules( - open_prices_price_list, gdpr_source=source + # Step 4a: filter prices depending on location + print("===== Applying location filtering rules") + open_prices_price_list_filtered_1 = gdpr_source_location_rules( + open_prices_price_list ) print(len(open_prices_price_list_filtered_1)) - # Step 4b: filter prices depending on location - print("===== Applying location filtering rules") - open_prices_price_list_filtered_2 = gdpr_source_location_rules( - open_prices_price_list_filtered_1 + # Step 4b: filter prices depending on specific source rules + print("===== Applying source filtering rules") + open_prices_price_list_filtered_2 = gdpr_source_filter_rules( + open_prices_price_list_filtered_1, gdpr_source=source ) print(len(open_prices_price_list_filtered_2)) - print("===== Output example (extra fields will be ignored):") + print("===== Output example (extra fields will be ignored)") print(open_prices_price_list_filtered_2[0]) # Step 5: send prices to backend via API diff --git a/scripts/gdpr/gdpr_field_mapping.csv b/scripts/gdpr/gdpr_field_mapping.csv index 3658d951..0041e8bc 100644 --- a/scripts/gdpr/gdpr_field_mapping.csv +++ b/scripts/gdpr/gdpr_field_mapping.csv @@ -1,8 +1,8 @@ -OPEN_PRICES_FIELD,AUCHAN_FIELD,AUCHAN_COMMENT,CARREFOUR_FIELD,CARREFOUR_COMMENT,ELECLERC_FIELD,ELECLERC_COMMENT,INTERMARCHE_FIELD,INTERMARCHE_COMMENT -product_code,CODE_PRODUIT,"raw products have a length of 4 or 12 or 13 but ending with lots of 0 (fruits, 
vegetables, meat, cheese) (ex: 4400, 200512000000, 2630329000000)",Code Barre du produit,prefixed and suffixed with |,ean,,COD_ARTC_EAN,duplicate column with EAN_GD -product_name,NOM_PRODUIT,,Description du produit,,article_libelle,,LB_ARTC,duplicate column with LB_COMM0 -price,PRIX_UNITAIRE,,Prix unitaire TTC avec remise (€),,article_prix_unitaire,,CA TTC Produit,has commas instead of points -discount,,,"Remise sur le produit (€) (chaque remise d'un produit regroupe les promotions, les avantages de la carte PASS, les bons d'achats… appliqués lors du passage en caisse)",,,,, -quantity,,,Quantité,,,,Qte Vendues, -date,JOUR,format YYYY-MM-DD,Date de transaction,format DD/MM/YYYY,date_ticket,format YYYY-MM-DD,DT_TICK,format DD/MM/YYYY -location,CODE_POSTAL,,NOM DU MAGASIN,,code_postal,,LB_COMM, +OPEN_PRICES_FIELD,AUCHAN_FIELD,AUCHAN_COMMENT,CARREFOUR_FIELD,CARREFOUR_COMMENT,ELECLERC_FIELD,ELECLERC_COMMENT,INTERMARCHE_FIELD,INTERMARCHE_COMMENT,PICARD_FIELD,PICARD_COMMENT +product_code,CODE_PRODUIT,"raw products have a length of 4 or 12 or 13 but ending with lots of 0 (fruits, vegetables, meat, cheese) (ex: 4400, 200512000000, 2630329000000)",Code Barre du produit,prefixed and suffixed with |,ean,,COD_ARTC_EAN,duplicate column with EAN_GD,CODE PRODUIT,a 5-number code. 
need to do an extra API search to find the corresponding product +product_name,NOM_PRODUIT,,Description du produit,,article_libelle,,LB_ARTC,duplicate column with LB_COMM0,LIBELLE ARTICLE, +price,PRIX_UNITAIRE,,Prix unitaire TTC avec remise (€),,article_prix_unitaire,,CA TTC Produit,has commas instead of points,PRIX TTC,has commas instead of points +discount,,,"Remise sur le produit (€) (chaque remise d'un produit regroupe les promotions, les avantages de la carte PASS, les bons d'achats… appliqués lors du passage en caisse)",,,,,,IDENTIFIANT REMISE,a string ID to another table +quantity,,,Quantité,,,,Qte Vendues,,NOMBRE UNITES, +date,JOUR,format YYYY-MM-DD,Date de transaction,format DD/MM/YYYY,date_ticket,format YYYY-MM-DD,DT_TICK,format DD/MM/YYYY,DATE TICKET, +location,CODE_POSTAL,,NOM DU MAGASIN,,code_postal,,LB_COMM,,NOM DU MAGASIN, diff --git a/scripts/gdpr/merge_two_csv_files.py b/scripts/gdpr/merge_two_csv_files.py index 5dfcbdfd..e6322ec5 100644 --- a/scripts/gdpr/merge_two_csv_files.py +++ b/scripts/gdpr/merge_two_csv_files.py @@ -13,20 +13,37 @@ def read_csv(filepath): return data -def merge_data_of_two_lists(list_1, list_2, pivot_list=["ticket"]): - print(pivot_list) +def merge_data_of_two_lists( + list_1, list_2, pivot_field_name_list=["ticket"], exclude_field_name_list=[] +): data_merged = list() for row_1 in list_1: row_2 = None + # find corresponding row in list_2 for row in list_2: - if all(row_1[pivot] == row[pivot] for pivot in pivot_list): + if all( + row_1[pivot_field_name] == row[pivot_field_name] + for pivot_field_name in pivot_field_name_list + ): row_2 = row if not row_2: row_2 = { - **{key: row_1[key] for key in list_2[0].keys() if key in pivot_list}, - **{key: "" for key in list_2[0].keys() if key not in pivot_list}, + **{ + key: row_1[key] + for key in list_2[0].keys() + if key in pivot_field_name_list + }, + **{ + key: "" + for key in list_2[0].keys() + if key not in pivot_field_name_list + }, } + # cleanup row_2 + for 
exclude_field_name in exclude_field_name_list: + row_2.pop(exclude_field_name, None) + # merge data_merged.append({**row_1, **row_2}) return data_merged @@ -44,12 +61,14 @@ def write_csv(data, filepath): if __name__ == "__main__": """ How-to run: - > FILEPATH_1= FILEPATH_2= PIVOT_FIELD_NAME= poetry run python data/gdpr/merge_two_csv_files.py # noqa + > FILEPATH_1= FILEPATH_2= PIVOT_FIELD_NAME_LIST= EXCLUDE_FIELD_NAME_LIST= poetry run python scripts/gdpr/merge_two_csv_files.py # noqa """ filepath_1 = os.environ.get("FILEPATH_1") filepath_2 = os.environ.get("FILEPATH_2") - pivot_field_name = os.environ.get("PIVOT_FIELD_NAME") - pivot_field_name_list = pivot_field_name.split(",") + pivot_field_name_str = os.environ.get("PIVOT_FIELD_NAME_LIST") + pivot_field_name_list = pivot_field_name_str.split(",") + exclude_field_name_str = os.environ.get("EXCLUDE_FIELD_NAME_LIST") + exclude_field_name_list = exclude_field_name_str.split(",") output_filepath = filepath_1.split(".csv")[0] + "_merged.csv" print(f"Step 1: reading {filepath_1}") @@ -60,9 +79,14 @@ def write_csv(data, filepath): data_2 = read_csv(filepath_2) print(f"{len(data_2)} lines") - print(f"Step 3: merging the two lists with pivot(s): {pivot_field_name_list}") + print( + f"Step 3: merging the two lists with pivot(s): {pivot_field_name_list} (and excluding: {exclude_field_name_list})" + ) data_merged = merge_data_of_two_lists( - data_1, data_2, pivot_list=pivot_field_name_list + data_1, + data_2, + pivot_field_name_list=pivot_field_name_list, + exclude_field_name_list=exclude_field_name_list, ) print(f"{len(data_merged)} lines") diff --git a/scripts/gdpr/utils.py b/scripts/gdpr/utils.py new file mode 100644 index 00000000..9c9a08d9 --- /dev/null +++ b/scripts/gdpr/utils.py @@ -0,0 +1,68 @@ +import requests + +OFF_SEARCHLICIOUS_API_ENDPOINT = "https://search.openfoodfacts.org/search" +PICARD_GS1_PREFIX = "327016" + + +def get_picard_product_from_subcode(op_price_dict): + # the Picard product_code is incomplete + # 
use Search-a-licious API to get the full product code + # if needed, prompt the user to select the correct one + passes_test = True + full_product_code = None + + print( + "----- Input:", + op_price_dict["product_code"], + op_price_dict["product_name"], + op_price_dict["price"], + ) + for q_index, q_params in enumerate( + [ + f"code:{PICARD_GS1_PREFIX}?{op_price_dict['product_code']}? brands:picard", + f"code:{PICARD_GS1_PREFIX}?{op_price_dict['product_code']}?", + f"code:*{op_price_dict['product_code']}? brands:picard", + f"code:*{op_price_dict['product_code']}?&page_size=50", + ] + ): + response = requests.get( + OFF_SEARCHLICIOUS_API_ENDPOINT, + params={"q": q_params}, + ) + print(response.url) + if response.status_code == 200: + response_product_count = response.json()["count"] + print("Products found:", response_product_count) + if response_product_count: + # confidence strong enough: take the first product + if (q_index < 2) and (response_product_count == 1): + full_product_code = response.json()["hits"][0]["code"] + else: + # multiple results: prompt the user to select + response_product_list = response.json()["hits"] + for index, response_product in enumerate(response_product_list): + print( + index + 1, + ":", + response_product.get("code"), + response_product.get("product_name", ""), + response_product.get("brands_tags", ""), + response_product.get("stores", ""), + ) + user_choice_number_str = input( + "Which product ? Type 0 to skip. Or provide the correct code. " + ) + if len(user_choice_number_str) == 1: + full_product_code = response_product_list[ + int(user_choice_number_str) - 1 + ]["code"] + print("Chosen product code:", full_product_code) + elif 3 < len(user_choice_number_str) <= 13: + full_product_code = user_choice_number_str + print("Chosen product code:", full_product_code) + else: + print("Product not found...") + passes_test = False + break + + return passes_test, full_product_code