refactor(GDPR): add README, improve Carrefour mgmt. ref #213

openfoodfacts · May 3, 2024 · e6f667d · e6f667d
1 parent 69752a2
commit e6f667d
Show file tree

Hide file tree

Showing 4 changed files with 96 additions and 10 deletions.
diff --git a/data/gdpr/README.md b/data/gdpr/README.md
@@ -0,0 +1,66 @@
+# GDPR request data
+
+## Context
+
+One of our data sources is GDPR requests sent to supermarkets. See https://wiki.openfoodfacts.org/GDPR_request
+
+## List of supermarkets
+
+|Supermarket|Data|Preprocessing|
+|-----------|---|---|
+|Auchan     |1 single file||
+|Carrefour  |1 file with 2 tabs|- merge files<br/>- skip discounts|
+|E.Leclerc  |2 files|- merge files|
+|Intermarché|1 single file||
+
+## Usage
+
+### Step 1: get an API token
+
+https://prices.openfoodfacts.org/api/docs#/Auth/authentication_api_v1_auth_post
+
+### Step 2: upload a proof
+
+Use the token returned in Step 1.
+
+You can upload your proof via Postman (change the key to "File").
+
+### Step 3: get your file ready
+
+If the data comes in different files, use the `merge_two_csv_files.py` script (details below).
+
+The file must be a `.csv`.
+
+### Step 4: upload your file
+
+#### For each location
+
+Depending on the source, you'll need to provide the correct `LOCATION` key, and provide the corresponding `LOCATION_OSM_ID` & `LOCATION_OSM_TYPE`. You can use https://www.openstreetmap.org/ to pinpoint the corresponding places.
+
+#### Upload command
+
+Use the token returned in Step 1.
+
+```
+FILEPATH=../data/Carrefour/Carte_Carrefour_NAME_merged.csv SOURCE=CARREFOUR LOCATION="City Jaures Grenoble" LOCATION_OSM_ID=1697821864 LOCATION_OSM_TYPE=NODE PROOF_ID=1234 API_ENDPOINT=https://prices.openfoodfacts.net/api/v1 API_TOKEN=username_token-hash poetry run python data/gdpr/create_prices_from_gdpr_csv.py
+```
+
+Last changes when you're ready:
+- replace the API_ENDPOINT with `https://prices.openfoodfacts.org/api/v1`
+- add `DRY_RUN=False` to actually upload your data
+
+## Other tools
+
+### Merge two csv files
+
+Script name: `merge_two_csv_files.py`
+
+E.Leclerc returns 2 different files, one containing a list of receipts (with dates & locations), and the other a list of products with their receipt id. So we need to first merge the 2 files into 1.
+```
+(TODO)
+```
+
+For Carrefour, the file contains 2 tabs, 1 called "Tickets" and the other called "Remise".
+```
+FILEPATH_1=Carte_Carrefour_NAME_liste_tickets_Tickets.csv FILEPATH_2=Carte_Carrefour_NAME_liste_tickets_Remises.csv PIVOT_FIELD_NAME="Numéro du ticket de caisse magasin,Code Barre du produit,Description du produit" poetry run python data/gdpr/merge_two_csv_files.py
+```
diff --git a/data/gdpr/create_prices_from_gdpr_csv.py b/data/gdpr/create_prices_from_gdpr_csv.py
@@ -15,6 +15,7 @@
     "product_code",
     "product_name",
     "price",
+    "discount",  # extra
     "quantity",  # extra
     "currency",
     "location",  # extra
@@ -25,13 +26,13 @@
 
 REQUIRED_ENV_PARAMS = [
     # "FILEPATH"
-    "API_TOKEN",
-    "API_ENDPOINT",
     "SOURCE",
     "LOCATION",
     "LOCATION_OSM_ID",
     "LOCATION_OSM_TYPE",
     "PROOF_ID",
+    "API_ENDPOINT",
+    "API_TOKEN",
     # DRY_RUN
 ]
 
@@ -118,6 +119,10 @@ def gdpr_source_filter_rules(op_price_list, gdpr_source=""):
                 passes_test = False
             elif op_price["product_name"] in ["BOUCHERIE", "Coupon Rem Caisse"]:
                 passes_test = False
+            elif op_price["discount"]:
+                passes_test = False
+            elif op_price["quantity"].startswith("-"):
+                passes_test = False
         elif gdpr_source == "ELECLERC":
             if len(op_price["product_code"]) < 6:
                 passes_test = False
@@ -218,10 +223,14 @@ def create_price(price):
     Required params: see REQUIRED_ENV_PARAMS
     """
     # Step 1: read input file
+    if not os.environ.get("FILEPATH"):
+        sys.exit("Error: missing FILEPATH env")
     filepath = os.environ.get("FILEPATH")
     print(f"===== Reading {filepath}")
     gdpr_price_list = read_gdpr_csv(filepath)
     print(len(gdpr_price_list))
+
+    print("===== Input example:")
     print(gdpr_price_list[0])
 
     # Step 2: check env params are all present
@@ -261,9 +270,8 @@ def create_price(price):
     )
     print(len(open_prices_price_list_filtered_2))
 
+    print("===== Output example (extra fields will be ignored):")
     print(open_prices_price_list_filtered_2[0])
-    # for p in open_prices_price_list_filtered:
-    #     print(p)
 
     # Step 5: send prices to backend via API
     if os.environ.get("DRY_RUN") == "False":

diff --git a/data/gdpr/gdpr_field_mapping.csv b/data/gdpr/gdpr_field_mapping.csv
@@ -2,6 +2,7 @@ OPEN_PRICES_FIELD,AUCHAN_FIELD,AUCHAN_COMMENT,CARREFOUR_FIELD,CARREFOUR_COMMENT,
 product_code,CODE_PRODUIT,"raw products have a length of 4 or 12 or 13 but ending with lots of 0 (fruits, vegetables, meat, cheese) (ex: 4400, 200512000000, 2630329000000)",Code Barre du produit,prefixed and suffixed with |,ean,,COD_ARTC_EAN,duplicate column with EAN_GD
 product_name,NOM_PRODUIT,,Description du produit,,article_libelle,,LB_ARTC,duplicate column with LB_COMM0
 price,PRIX_UNITAIRE,,Prix unitaire TTC avec remise (€),,article_prix_unitaire,,CA TTC Produit,has commas instead of points
-quantity,,,,,,,Qte Vendues,
+discount,,,"Remise sur le produit (€) (chaque remise d'un produit regroupe les promotions, les avantages de la carte PASS,  les bons d'achats…  appliqués lors du passage en caisse)",,,,,
+quantity,,,Quantité,,,,Qte Vendues,
 date,JOUR,format YYYY-MM-DD,Date de transaction,format DD/MM/YYYY,date_ticket,format YYYY-MM-DD,DT_TICK,format DD/MM/YYYY
 location,CODE_POSTAL,,NOM DU MAGASIN,,code_postal,,LB_COMM,
diff --git a/data/gdpr/merge_two_csv_files.py b/data/gdpr/merge_two_csv_files.py
@@ -13,12 +13,20 @@ def read_csv(filepath):
     return data
 
 
-def merge_data_of_two_lists(list_1, list_2, pivot="ticket"):
+def merge_data_of_two_lists(list_1, list_2, pivot_list=["ticket"]):
+    print(pivot_list)
     data_merged = list()
 
     for row_1 in list_1:
-        pivot_1 = row_1[pivot]
-        row_2 = next(row for row in list_2 if row[pivot] == pivot_1)
+        row_2 = None
+        for row in list_2:
+            if all(row_1[pivot] == row[pivot] for pivot in pivot_list):
+                row_2 = row
+        if not row_2:
+            row_2 = {
+                **{key: row_1[key] for key in list_2[0].keys() if key in pivot_list},
+                **{key: "" for key in list_2[0].keys() if key not in pivot_list},
+            }
         data_merged.append({**row_1, **row_2})
 
     return data_merged
@@ -41,6 +49,7 @@ def write_csv(data, filepath):
     filepath_1 = os.environ.get("FILEPATH_1")
     filepath_2 = os.environ.get("FILEPATH_2")
     pivot_field_name = os.environ.get("PIVOT_FIELD_NAME")
+    pivot_field_name_list = pivot_field_name.split(",")
     output_filepath = filepath_1.split(".csv")[0] + "_merged.csv"
 
     print(f"Step 1: reading {filepath_1}")
@@ -51,8 +60,10 @@ def write_csv(data, filepath):
     data_2 = read_csv(filepath_2)
     print(f"{len(data_2)} lines")
 
-    print("Step 3: merging the two lists")
-    data_merged = merge_data_of_two_lists(data_1, data_2, pivot=pivot_field_name)
+    print(f"Step 3: merging the two lists with pivot(s): {pivot_field_name_list}")
+    data_merged = merge_data_of_two_lists(
+        data_1, data_2, pivot_list=pivot_field_name_list
+    )
     print(f"{len(data_merged)} lines")
 
     print("Step 4: write CSV")