diff --git a/scripts/gdpr/README.md b/scripts/gdpr/README.md
index ff6744b9..d8de34f4 100644
--- a/scripts/gdpr/README.md
+++ b/scripts/gdpr/README.md
@@ -1,23 +1,26 @@
-# GDPR request data
+# Uploading GDPR price data
## Context
-One of our data sources is GDPR request to supermarkets. See https://wiki.openfoodfacts.org/GDPR_request
+One of our data sources is GDPR requests to supermarkets (via loyalty cards).
+
+See https://wiki.openfoodfacts.org/GDPR_request
## List of supermarkets
-|Supermarket|Data|Preprocessing|
-|-----------|---|---|
-|Auchan |1 single file||
+|Supermarket|Data |Preprocessing|
+|-----------|------------------|---|
+|Auchan |1 single file ||
|Carrefour |1 file with 2 tabs|- merge files
- skip discounts|
-|E.Leclerc |2 files|- merge files|
-|Intermarché|1 single file||
+|E.Leclerc |2 files |- merge files|
+|Intermarché|1 single file ||
+|Picard |1 file with multiple tables|- create separate files
- merge files|
## Usage
-### Step 1: get an API token
+### Step 1: get your API token from Open Prices
-https://prices.openfoodfacts.org/api/docs#/Auth/authentication_api_v1_auth_post
+https://prices.openfoodfacts.org/api/docs#/auth/auth_create
### Step 2: upload a proof
@@ -42,7 +45,7 @@ Depending on the source, you'll need to provide the correct `LOCATION` key, and
Use the token returned in Step 1.
```
-FILEPATH=../data/Carrefour/Carte_Carrefour_NAME_merged.csv SOURCE=CARREFOUR LOCATION="City Jaures Grenoble" LOCATION_OSM_ID=1697821864 LOCATION_OSM_TYPE=NODE PROOF_ID=1234 API_ENDPOINT=https://prices.openfoodfacts.net/api/v1 API_TOKEN=username_token-hash poetry run python data/gdpr/create_prices_from_gdpr_csv.py
+FILEPATH=../data/Carrefour/Carte_Carrefour_NAME_merged.csv SOURCE=CARREFOUR LOCATION="City Jaures Grenoble" LOCATION_OSM_ID=1697821864 LOCATION_OSM_TYPE=NODE PROOF_ID=1234 API_ENDPOINT=https://prices.openfoodfacts.net/api/v1 API_TOKEN=username_token-hash poetry run python scripts/gdpr/create_prices_from_gdpr_csv.py
```
Last changes when you're ready:
@@ -55,12 +58,25 @@ Last changes when you're ready:
Script name: `merge_two_csv_files.csv`
+Goal: merge and enrich data from the second csv file into the first csv file.
+
+#### E.Leclerc
+
E.Leclerc returns 2 different files, one containing a list of receipts (with dates & locations), and the other a list of products with their receipt id. So we need to first merge the 2 files into 1.
```
(TODO)
```
+#### Carrefour
+
For Carrefour, the file contains 2 tabs, 1 called "Tickets" and the other called "Remise".
```
-FILEPATH_1=Carte_Carrefour_NAME_liste_tickets_Tickets.csv FILEPATH_2=Carte_Carrefour_NAME_liste_tickets_Remises.csv PIVOT_FIELD_NAME="Numéro du ticket de caisse magasin,Code Barre du produit,Description du produit" poetry run python data/gdpr/merge_two_csv_files.py
+FILEPATH_1=Carte_Carrefour_NAME_liste_tickets_Tickets.csv FILEPATH_2=Carte_Carrefour_NAME_liste_tickets_Remises.csv PIVOT_FIELD_NAME_LIST="Numéro du ticket de caisse magasin,Code Barre du produit,Description du produit" poetry run python scripts/gdpr/merge_two_csv_files.py
+```
+
+#### Picard
+
+Picard returns 1 spreadsheet containing multiple tables. We first need to store the Product table & the Tickets table in 2 separate csv files.
+```
+FILEPATH_1=Picard_Produits.csv FILEPATH_2=Picard_Tickets.csv PIVOT_FIELD_NAME_LIST="NUMERO DE TICKET" EXCLUDE_FIELD_NAME_LIST="PRIX TTC" poetry run python scripts/gdpr/merge_two_csv_files.py
```
diff --git a/scripts/gdpr/create_prices_from_gdpr_csv.py b/scripts/gdpr/create_prices_from_gdpr_csv.py
index 5b762e39..5a63abd6 100644
--- a/scripts/gdpr/create_prices_from_gdpr_csv.py
+++ b/scripts/gdpr/create_prices_from_gdpr_csv.py
@@ -5,10 +5,12 @@
import time
import requests
+from utils import get_picard_product_from_subcode
OPEN_PRICES_CREATE_PRICE_ENDPOINT = f'{os.environ.get("API_ENDPOINT")}/prices'
OPEN_PRICES_TOKEN = os.environ.get("API_TOKEN")
-GDPR_FIELD_MAPPING_FILEPATH = "data/gdpr/gdpr_field_mapping.csv"
+
+GDPR_FIELD_MAPPING_FILEPATH = "scripts/gdpr/gdpr_field_mapping.csv"
DEFAULT_PRICE_CURRENCY = "EUR"
PRICE_FIELDS = [
@@ -44,10 +46,14 @@ def gdpr_source_field_cleanup_rules(gdpr_source, op_field, gdpr_field_value):
# remove any whitespace
gdpr_field_value = gdpr_field_value.strip()
- # shop specific rules
- if gdpr_source == "AUCHAN":
- if op_field == "price":
+ # field-specific rules
+ if op_field in ["price", "quantity"]:
+ if gdpr_field_value:
gdpr_field_value = float(gdpr_field_value.replace(",", "."))
+
+ # shop-specific rules
+ if gdpr_source == "AUCHAN":
+ pass
elif gdpr_source == "CARREFOUR":
# input: |3178050000749|
# output: 3178050000749
@@ -62,15 +68,18 @@ def gdpr_source_field_cleanup_rules(gdpr_source, op_field, gdpr_field_value):
elif gdpr_source == "ELECLERC":
pass
elif gdpr_source == "INTERMARCHE":
- if op_field in ["price", "quantity"]:
- # divide price by quantity
- gdpr_field_value = float(gdpr_field_value.replace(",", "."))
# input: 27/05/2021
# output: 2021-05-27
if op_field == "date":
gdpr_field_value = datetime.datetime.strptime(
gdpr_field_value, "%d/%m/%Y"
).strftime("%Y-%m-%d")
+ elif gdpr_source == "PICARD":
+ # Picard codes are a subset of the EAN codes
+ # They have a length of 5 (4 if missing leading 0)
+ if op_field == "product_code":
+ if len(gdpr_field_value) == 4:
+ gdpr_field_value = f"0{gdpr_field_value}"
return gdpr_field_value
@@ -79,15 +88,15 @@ def gdpr_source_price_cleanup_rules(gdpr_source, gdpr_op_price):
"""
Rules to cleanup the price object
"""
- if gdpr_source == "AUCHAN":
- pass
- elif gdpr_source == "CARREFOUR":
- pass
- elif gdpr_source == "ELECLERC":
- pass
- elif gdpr_source == "INTERMARCHE":
- # price must be divided by quantity
- gdpr_op_price["price"] = gdpr_op_price["price"] / gdpr_op_price["quantity"]
+ # price must be divided by quantity
+ if "quantity" in gdpr_op_price:
+ if gdpr_op_price["quantity"]:
+ gdpr_op_price["price"] = gdpr_op_price["price"] / gdpr_op_price["quantity"]
+
+ # discount boolean flag
+ if "discount" in gdpr_op_price:
+ if gdpr_op_price["discount"]:
+ gdpr_op_price["price_is_discounted"] = True
return gdpr_op_price
@@ -135,6 +144,12 @@ def gdpr_source_filter_rules(op_price_list, gdpr_source=""):
passes_test = False
elif gdpr_source == "INTERMARCHE":
pass
+ elif gdpr_source == "PICARD":
+ full_product_code = get_picard_product_from_subcode(op_price)
+ if full_product_code:
+ op_price["product_code"] = full_product_code
+ else:
+ passes_test = False
if passes_test:
op_price_list_filtered.append(op_price)
@@ -219,7 +234,7 @@ def create_price(price):
if __name__ == "__main__":
"""
How-to run:
- > FILEPATH= poetry run python data/gdpr/create_prices_from_gdpr_csv.py
+ > FILEPATH= poetry run python scripts/gdpr/create_prices_from_gdpr_csv.py
Required params: see REQUIRED_ENV_PARAMS
"""
# Step 1: read input file
@@ -256,21 +271,21 @@ def create_price(price):
)
print(len(open_prices_price_list))
- # Step 4a: filter prices depending on specific source rules
- print("===== Applying source filtering rules")
- open_prices_price_list_filtered_1 = gdpr_source_filter_rules(
- open_prices_price_list, gdpr_source=source
+ # Step 4a: filter prices depending on location
+ print("===== Applying location filtering rules")
+ open_prices_price_list_filtered_1 = gdpr_source_location_rules(
+ open_prices_price_list
)
print(len(open_prices_price_list_filtered_1))
- # Step 4b: filter prices depending on location
- print("===== Applying location filtering rules")
- open_prices_price_list_filtered_2 = gdpr_source_location_rules(
- open_prices_price_list_filtered_1
+ # Step 4b: filter prices depending on specific source rules
+ print("===== Applying source filtering rules")
+ open_prices_price_list_filtered_2 = gdpr_source_filter_rules(
+ open_prices_price_list_filtered_1, gdpr_source=source
)
print(len(open_prices_price_list_filtered_2))
- print("===== Output example (extra fields will be ignored):")
+ print("===== Output example (extra fields will be ignored)")
print(open_prices_price_list_filtered_2[0])
# Step 5: send prices to backend via API
diff --git a/scripts/gdpr/gdpr_field_mapping.csv b/scripts/gdpr/gdpr_field_mapping.csv
index 3658d951..0041e8bc 100644
--- a/scripts/gdpr/gdpr_field_mapping.csv
+++ b/scripts/gdpr/gdpr_field_mapping.csv
@@ -1,8 +1,8 @@
-OPEN_PRICES_FIELD,AUCHAN_FIELD,AUCHAN_COMMENT,CARREFOUR_FIELD,CARREFOUR_COMMENT,ELECLERC_FIELD,ELECLERC_COMMENT,INTERMARCHE_FIELD,INTERMARCHE_COMMENT
-product_code,CODE_PRODUIT,"raw products have a length of 4 or 12 or 13 but ending with lots of 0 (fruits, vegetables, meat, cheese) (ex: 4400, 200512000000, 2630329000000)",Code Barre du produit,prefixed and suffixed with |,ean,,COD_ARTC_EAN,duplicate column with EAN_GD
-product_name,NOM_PRODUIT,,Description du produit,,article_libelle,,LB_ARTC,duplicate column with LB_COMM0
-price,PRIX_UNITAIRE,,Prix unitaire TTC avec remise (€),,article_prix_unitaire,,CA TTC Produit,has commas instead of points
-discount,,,"Remise sur le produit (€) (chaque remise d'un produit regroupe les promotions, les avantages de la carte PASS, les bons d'achats… appliqués lors du passage en caisse)",,,,,
-quantity,,,Quantité,,,,Qte Vendues,
-date,JOUR,format YYYY-MM-DD,Date de transaction,format DD/MM/YYYY,date_ticket,format YYYY-MM-DD,DT_TICK,format DD/MM/YYYY
-location,CODE_POSTAL,,NOM DU MAGASIN,,code_postal,,LB_COMM,
+OPEN_PRICES_FIELD,AUCHAN_FIELD,AUCHAN_COMMENT,CARREFOUR_FIELD,CARREFOUR_COMMENT,ELECLERC_FIELD,ELECLERC_COMMENT,INTERMARCHE_FIELD,INTERMARCHE_COMMENT,PICARD_FIELD,PICARD_COMMENT
+product_code,CODE_PRODUIT,"raw products have a length of 4 or 12 or 13 but ending with lots of 0 (fruits, vegetables, meat, cheese) (ex: 4400, 200512000000, 2630329000000)",Code Barre du produit,prefixed and suffixed with |,ean,,COD_ARTC_EAN,duplicate column with EAN_GD,CODE PRODUIT,a 5-number code. need to do an extra API search to find the corresponding product
+product_name,NOM_PRODUIT,,Description du produit,,article_libelle,,LB_ARTC,duplicate column with LB_COMM0,LIBELLE ARTICLE,
+price,PRIX_UNITAIRE,,Prix unitaire TTC avec remise (€),,article_prix_unitaire,,CA TTC Produit,has commas instead of points,PRIX TTC,has commas instead of points
+discount,,,"Remise sur le produit (€) (chaque remise d'un produit regroupe les promotions, les avantages de la carte PASS, les bons d'achats… appliqués lors du passage en caisse)",,,,,,IDENTIFIANT REMISE,a string ID to another table
+quantity,,,Quantité,,,,Qte Vendues,,NOMBRE UNITES,
+date,JOUR,format YYYY-MM-DD,Date de transaction,format DD/MM/YYYY,date_ticket,format YYYY-MM-DD,DT_TICK,format DD/MM/YYYY,DATE TICKET,
+location,CODE_POSTAL,,NOM DU MAGASIN,,code_postal,,LB_COMM,,NOM DU MAGASIN,
diff --git a/scripts/gdpr/merge_two_csv_files.py b/scripts/gdpr/merge_two_csv_files.py
index 5dfcbdfd..e6322ec5 100644
--- a/scripts/gdpr/merge_two_csv_files.py
+++ b/scripts/gdpr/merge_two_csv_files.py
@@ -13,20 +13,37 @@ def read_csv(filepath):
return data
-def merge_data_of_two_lists(list_1, list_2, pivot_list=["ticket"]):
- print(pivot_list)
+def merge_data_of_two_lists(
+ list_1, list_2, pivot_field_name_list=["ticket"], exclude_field_name_list=[]
+):
data_merged = list()
for row_1 in list_1:
row_2 = None
+ # find corresponding row in list_2
for row in list_2:
- if all(row_1[pivot] == row[pivot] for pivot in pivot_list):
+ if all(
+ row_1[pivot_field_name] == row[pivot_field_name]
+ for pivot_field_name in pivot_field_name_list
+ ):
row_2 = row
if not row_2:
row_2 = {
- **{key: row_1[key] for key in list_2[0].keys() if key in pivot_list},
- **{key: "" for key in list_2[0].keys() if key not in pivot_list},
+ **{
+ key: row_1[key]
+ for key in list_2[0].keys()
+ if key in pivot_field_name_list
+ },
+ **{
+ key: ""
+ for key in list_2[0].keys()
+ if key not in pivot_field_name_list
+ },
}
+ # cleanup row_2
+ for exclude_field_name in exclude_field_name_list:
+ row_2.pop(exclude_field_name, None)
+ # merge
data_merged.append({**row_1, **row_2})
return data_merged
@@ -44,12 +61,14 @@ def write_csv(data, filepath):
if __name__ == "__main__":
"""
How-to run:
- > FILEPATH_1= FILEPATH_2= PIVOT_FIELD_NAME= poetry run python data/gdpr/merge_two_csv_files.py # noqa
+ > FILEPATH_1= FILEPATH_2= PIVOT_FIELD_NAME_LIST= EXCLUDE_FIELD_NAME_LIST= poetry run python scripts/gdpr/merge_two_csv_files.py # noqa
"""
filepath_1 = os.environ.get("FILEPATH_1")
filepath_2 = os.environ.get("FILEPATH_2")
- pivot_field_name = os.environ.get("PIVOT_FIELD_NAME")
- pivot_field_name_list = pivot_field_name.split(",")
+ pivot_field_name_str = os.environ.get("PIVOT_FIELD_NAME_LIST")
+ pivot_field_name_list = pivot_field_name_str.split(",")
+ exclude_field_name_str = os.environ.get("EXCLUDE_FIELD_NAME_LIST")
+ exclude_field_name_list = exclude_field_name_str.split(",")
output_filepath = filepath_1.split(".csv")[0] + "_merged.csv"
print(f"Step 1: reading {filepath_1}")
@@ -60,9 +79,14 @@ def write_csv(data, filepath):
data_2 = read_csv(filepath_2)
print(f"{len(data_2)} lines")
- print(f"Step 3: merging the two lists with pivot(s): {pivot_field_name_list}")
+ print(
+ f"Step 3: merging the two lists with pivot(s): {pivot_field_name_list} (and excluding: {exclude_field_name_list})"
+ )
data_merged = merge_data_of_two_lists(
- data_1, data_2, pivot_list=pivot_field_name_list
+ data_1,
+ data_2,
+ pivot_field_name_list=pivot_field_name_list,
+ exclude_field_name_list=exclude_field_name_list,
)
print(f"{len(data_merged)} lines")
diff --git a/scripts/gdpr/utils.py b/scripts/gdpr/utils.py
new file mode 100644
index 00000000..9c9a08d9
--- /dev/null
+++ b/scripts/gdpr/utils.py
@@ -0,0 +1,83 @@
+import requests
+
+OFF_SEARCHLICIOUS_API_ENDPOINT = "https://search.openfoodfacts.org/search"
+PICARD_GS1_PREFIX = "327016"
+
+
+def get_picard_product_from_subcode(op_price_dict):
+    """
+    Resolve a partial Picard product code into a full EAN barcode.
+
+    The Picard GDPR export only contains a short product subcode, so we
+    query the Search-a-licious API with increasingly lenient queries until
+    a match is found. When several candidates come back, the user is
+    prompted to pick one (or to type the full code directly).
+
+    Returns the full product code (str), or None if no product was found
+    or the user chose to skip (the caller truth-tests this single value).
+    """
+    full_product_code = None
+
+    print(
+        "----- Input:",
+        op_price_dict["product_code"],
+        op_price_dict["product_name"],
+        op_price_dict["price"],
+    )
+    # Queries ordered from strict to lenient: only fall through to the next
+    # one when the previous returned no result.
+    for q_index, q_params in enumerate(
+        [
+            f"code:{PICARD_GS1_PREFIX}?{op_price_dict['product_code']}? brands:picard",
+            f"code:{PICARD_GS1_PREFIX}?{op_price_dict['product_code']}?",
+            f"code:*{op_price_dict['product_code']}? brands:picard",
+            f"code:*{op_price_dict['product_code']}?&page_size=50",
+        ]
+    ):
+        response = requests.get(
+            OFF_SEARCHLICIOUS_API_ENDPOINT,
+            params={"q": q_params},
+        )
+        print(response.url)
+        if response.status_code == 200:
+            response_product_count = response.json()["count"]
+            print("Products found:", response_product_count)
+            if not response_product_count:
+                # no match: try the next (more lenient) query
+                continue
+            # confidence strong enough: take the first product
+            if (q_index < 2) and (response_product_count == 1):
+                full_product_code = response.json()["hits"][0]["code"]
+            else:
+                # multiple results: prompt the user to select
+                response_product_list = response.json()["hits"]
+                for index, response_product in enumerate(response_product_list):
+                    print(
+                        index + 1,
+                        ":",
+                        response_product.get("code"),
+                        response_product.get("product_name", ""),
+                        response_product.get("brands_tags", ""),
+                        response_product.get("stores", ""),
+                    )
+                user_choice_number_str = input(
+                    "Which product ? Type 0 to skip. Or provide the correct code. "
+                )
+                if user_choice_number_str == "0":
+                    # user explicitly chose to skip this product
+                    pass
+                elif len(user_choice_number_str) == 1:
+                    full_product_code = response_product_list[
+                        int(user_choice_number_str) - 1
+                    ]["code"]
+                    print("Chosen product code:", full_product_code)
+                elif 3 < len(user_choice_number_str) <= 13:
+                    full_product_code = user_choice_number_str
+                    print("Chosen product code:", full_product_code)
+            if full_product_code:
+                # found (or manually provided): stop querying
+                break
+
+    if not full_product_code:
+        print("Product not found...")
+    return full_product_code