From 1e1ca99ddf001e7aa5a2a217e0c4c78c4c5b9f85 Mon Sep 17 00:00:00 2001
From: Charles Nepote
Date: Wed, 4 Dec 2024 15:07:00 +0100
Subject: [PATCH] Major update

---
 scripts/mirabelle/empty2null.sh              |  51 +++++
 scripts/mirabelle/products.schema.2024-01-18 | 210 +++++++++++++++++++
 scripts/mirabelle/products_daily_update.sh   | 156 +++++++++++---
 3 files changed, 384 insertions(+), 33 deletions(-)
 create mode 100755 scripts/mirabelle/empty2null.sh
 create mode 100644 scripts/mirabelle/products.schema.2024-01-18

diff --git a/scripts/mirabelle/empty2null.sh b/scripts/mirabelle/empty2null.sh
new file mode 100755
index 00000000..51249bbb
--- /dev/null
+++ b/scripts/mirabelle/empty2null.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# usage: ./empty2null.sh mydb.db
+
+teefile=tmp.txt
+
+function empty2null() {
+  for t in $(
+sqlite3 "$db" "SELECT name FROM sqlite_master
+WHERE type=='table' ORDER BY name" | tee ${teefile} )
+  do
+    nc=0
+    onec=''
+    mulc=''
+    where='WHERE'
+    cma="UPDATE OR ABORT \"${t}\" SET
+   "
+    for c in $(
+printf "SELECT '\"'||name||'\"' FROM pragma_table_info('%s')
+WHERE type == 'INTEGER' or type == 'FLOAT' ORDER BY cid ASC;\\n" "$t" \
+  | sqlite3 "$db" | tee -a ${teefile} )
+    do
+      onec+=$(printf "%s %s = NULL" "$cma" "$c")
+      mulc+=$(printf "%s %s = IIF(%s=='',NULL,%s)" "$cma" "$c" "$c" "$c")
+      cma='
+,'
+      test $nc -gt 0 && where+=' OR'
+      where+=" $c==''"
+      nc=$(( nc + 1 ))
+    done
+    if [ $nc -gt 0 ]
+    then
+      if [ $nc -gt 1 ]
+      then
+        printf '%s\n %s;\n' "$mulc" "${where}"
+      else
+        printf '%s\n %s;\n' "$onec" "${where}"
+      fi
+    fi
+  done
+}
+
+db=$1
+
+echo "$(date +'%Y-%m-%dT%H:%M:%S') - Building query"
+sql=$(empty2null)
+echo "$sql"
+
+echo "$(date +'%Y-%m-%dT%H:%M:%S') - Converting..."
+sqlite3 "$db" "$sql"
+echo "$(date +'%Y-%m-%dT%H:%M:%S') - ... converting empty to NULL ended"
diff --git a/scripts/mirabelle/products.schema.2024-01-18 b/scripts/mirabelle/products.schema.2024-01-18
new file mode 100644
index 00000000..47162d1f
--- /dev/null
+++ b/scripts/mirabelle/products.schema.2024-01-18
@@ -0,0 +1,210 @@
+/* head -n 1 en.openfoodfacts.org.products.csv | /usr/local/bin/vd -f tsv */;
+/* head -n 1 en.openfoodfacts.org.products.csv | tr "\t" "\n" | wc -l */;
+CREATE TABLE IF NOT EXISTS "all" (
+    [code] TEXT,
+    [url] TEXT,
+    [creator] TEXT,
+    [created_t] INTEGER,
+    [created_datetime] TEXT,
+    [last_modified_t] INTEGER,
+    [last_modified_datetime] TEXT,
+    [last_modified_by] TEXT,
+    [last_updated_t] INTEGER,
+    [last_updated_datetime] TEXT,
+    [product_name] TEXT,
+    [abbreviated_product_name] TEXT,
+    [generic_name] TEXT,
+    [quantity] TEXT,
+    [packaging] TEXT,
+    [packaging_tags] TEXT,
+    [packaging_en] TEXT,
+    [packaging_text] TEXT,
+    [brands] TEXT,
+    [brands_tags] TEXT,
+    [categories] TEXT,
+    [categories_tags] TEXT,
+    [categories_en] TEXT,
+    [origins] TEXT,
+    [origins_tags] TEXT,
+    [origins_en] TEXT,
+    [manufacturing_places] TEXT,
+    [manufacturing_places_tags] TEXT,
+    [labels] TEXT,
+    [labels_tags] TEXT,
+    [labels_en] TEXT,
+    [emb_codes] TEXT,
+    [emb_codes_tags] TEXT,
+    [first_packaging_code_geo] TEXT,
+    [cities] TEXT,
+    [cities_tags] TEXT,
+    [purchase_places] TEXT,
+    [stores] TEXT,
+    [countries] TEXT,
+    [countries_tags] TEXT,
+    [countries_en] TEXT,
+    [ingredients_text] TEXT,
+    [ingredients_tags] TEXT,
+    [ingredients_analysis_tags] TEXT,
+    [allergens] TEXT,
+    [allergens_en] TEXT,
+    [traces] TEXT,
+    [traces_tags] TEXT,
+    [traces_en] TEXT,
+    [serving_size] TEXT,
+    [serving_quantity] FLOAT,
+    [no_nutrition_data] INTEGER,
+    [additives_n] INTEGER,
+    [additives] TEXT,
+    [additives_tags] TEXT,
+    [additives_en] TEXT,
+    [nutriscore_score] INTEGER,
+    [nutriscore_grade] TEXT,
+    [nova_group] INTEGER,
+    [pnns_groups_1] TEXT,
+    [pnns_groups_2] TEXT,
+    [food_groups] TEXT,
+    [food_groups_tags] TEXT,
+    [food_groups_en] TEXT,
+    [states] TEXT,
+    [states_tags] TEXT,
+    [states_en] TEXT,
+    [brand_owner] TEXT,
+    [ecoscore_score] FLOAT,
+    [ecoscore_grade] TEXT,
+    [nutrient_levels_tags] TEXT,
+    [product_quantity] TEXT,
+    [owner] TEXT,
+    [data_quality_errors_tags] TEXT,
+    [unique_scans_n] INTEGER,
+    [popularity_tags] TEXT,
+    [completeness] FLOAT,
+    [last_image_t] INTEGER,
+    [last_image_datetime] TEXT,
+    [main_category] TEXT,
+    [main_category_en] TEXT,
+    [image_url] TEXT,
+    [image_small_url] TEXT,
+    [image_ingredients_url] TEXT,
+    [image_ingredients_small_url] TEXT,
+    [image_nutrition_url] TEXT,
+    [image_nutrition_small_url] TEXT,
+    [energy-kj_100g] FLOAT,
+    [energy-kcal_100g] FLOAT,
+    [energy_100g] FLOAT,
+    [energy-from-fat_100g] FLOAT,
+    [fat_100g] FLOAT,
+    [saturated-fat_100g] FLOAT,
+    [butyric-acid_100g] FLOAT,
+    [caproic-acid_100g] FLOAT,
+    [caprylic-acid_100g] FLOAT,
+    [capric-acid_100g] FLOAT,
+    [lauric-acid_100g] FLOAT,
+    [myristic-acid_100g] FLOAT,
+    [palmitic-acid_100g] FLOAT,
+    [stearic-acid_100g] FLOAT,
+    [arachidic-acid_100g] FLOAT,
+    [behenic-acid_100g] FLOAT,
+    [lignoceric-acid_100g] FLOAT,
+    [cerotic-acid_100g] FLOAT,
+    [montanic-acid_100g] FLOAT,
+    [melissic-acid_100g] FLOAT,
+    [unsaturated-fat_100g] FLOAT,
+    [monounsaturated-fat_100g] FLOAT,
+    [omega-9-fat_100g] FLOAT,
+    [polyunsaturated-fat_100g] FLOAT,
+    [omega-3-fat_100g] FLOAT,
+    [omega-6-fat_100g] FLOAT,
+    [alpha-linolenic-acid_100g] FLOAT,
+    [eicosapentaenoic-acid_100g] FLOAT,
+    [docosahexaenoic-acid_100g] FLOAT,
+    [linoleic-acid_100g] FLOAT,
+    [arachidonic-acid_100g] FLOAT,
+    [gamma-linolenic-acid_100g] FLOAT,
+    [dihomo-gamma-linolenic-acid_100g] FLOAT,
+    [oleic-acid_100g] FLOAT,
+    [elaidic-acid_100g] FLOAT,
+    [gondoic-acid_100g] FLOAT,
+    [mead-acid_100g] FLOAT,
+    [erucic-acid_100g] FLOAT,
+    [nervonic-acid_100g] FLOAT,
+    [trans-fat_100g] FLOAT,
+    [cholesterol_100g] FLOAT,
+    [carbohydrates_100g] FLOAT,
+    [sugars_100g] FLOAT,
+    [added-sugars_100g] FLOAT,
+    [sucrose_100g] FLOAT,
+    [glucose_100g] FLOAT,
+    [fructose_100g] FLOAT,
+    [lactose_100g] FLOAT,
+    [maltose_100g] FLOAT,
+    [maltodextrins_100g] FLOAT,
+    [starch_100g] FLOAT,
+    [polyols_100g] FLOAT,
+    [erythritol_100g] FLOAT,
+    [fiber_100g] FLOAT,
+    [soluble-fiber_100g] FLOAT,
+    [insoluble-fiber_100g] FLOAT,
+    [proteins_100g] FLOAT,
+    [casein_100g] FLOAT,
+    [serum-proteins_100g] FLOAT,
+    [nucleotides_100g] FLOAT,
+    [salt_100g] FLOAT,
+    [added-salt_100g] FLOAT,
+    [sodium_100g] FLOAT,
+    [alcohol_100g] FLOAT,
+    [vitamin-a_100g] FLOAT,
+    [beta-carotene_100g] FLOAT,
+    [vitamin-d_100g] FLOAT,
+    [vitamin-e_100g] FLOAT,
+    [vitamin-k_100g] FLOAT,
+    [vitamin-c_100g] FLOAT,
+    [vitamin-b1_100g] FLOAT,
+    [vitamin-b2_100g] FLOAT,
+    [vitamin-pp_100g] FLOAT,
+    [vitamin-b6_100g] FLOAT,
+    [vitamin-b9_100g] FLOAT,
+    [folates_100g] FLOAT,
+    [vitamin-b12_100g] FLOAT,
+    [biotin_100g] FLOAT,
+    [pantothenic-acid_100g] FLOAT,
+    [silica_100g] FLOAT,
+    [bicarbonate_100g] FLOAT,
+    [potassium_100g] FLOAT,
+    [chloride_100g] FLOAT,
+    [calcium_100g] FLOAT,
+    [phosphorus_100g] FLOAT,
+    [iron_100g] FLOAT,
+    [magnesium_100g] FLOAT,
+    [zinc_100g] FLOAT,
+    [copper_100g] FLOAT,
+    [manganese_100g] FLOAT,
+    [fluoride_100g] FLOAT,
+    [selenium_100g] FLOAT,
+    [chromium_100g] FLOAT,
+    [molybdenum_100g] FLOAT,
+    [iodine_100g] FLOAT,
+    [caffeine_100g] FLOAT,
+    [taurine_100g] FLOAT,
+    [ph_100g] FLOAT,
+    [fruits-vegetables-nuts_100g] FLOAT,
+    [fruits-vegetables-nuts-dried_100g] FLOAT,
+    [fruits-vegetables-nuts-estimate_100g] FLOAT,
+    [fruits-vegetables-nuts-estimate-from-ingredients_100g] FLOAT,
+    [collagen-meat-protein-ratio_100g] FLOAT,
+    [cocoa_100g] FLOAT,
+    [chlorophyl_100g] FLOAT,
+    [carbon-footprint_100g] FLOAT,
+    [carbon-footprint-from-meat-or-fish_100g] FLOAT,
+    [nutrition-score-fr_100g] FLOAT,
+    [nutrition-score-uk_100g] FLOAT,
+    [glycemic-index_100g] FLOAT,
+    [water-hardness_100g] FLOAT,
+    [choline_100g] FLOAT,
+    [phylloquinone_100g] FLOAT,
+    [beta-glucan_100g] FLOAT,
+    [inositol_100g] FLOAT,
+    [carnitine_100g] FLOAT,
+    [sulphate_100g] FLOAT,
+    [nitrate_100g] FLOAT,
+    [acidity_100g] FLOAT
+);
diff --git a/scripts/mirabelle/products_daily_update.sh b/scripts/mirabelle/products_daily_update.sh
index 277f7b83..5c85a8e9 100644
--- a/scripts/mirabelle/products_daily_update.sh
+++ b/scripts/mirabelle/products_daily_update.sh
@@ -1,13 +1,17 @@
 #!/bin/bash
 
 path=/home/off/mirabelle
-cd $path
+cd $path || { echo "$(date +'%Y-%m-%dT%H:%M:%S') - Can't cd ${path}"; exit 1; }
 mode=""
 file=""
+stats=""
+TODAY=$(date "+%Y-%m-%d")
+#csv="en.openfoodfacts.org.products.csv"
+url="https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv"
 
-# Read commandline arguments. Try -h to get usage.
-usage() { echo "$0 usage:" && grep " .)\ #" $0; exit 0; }
-while getopts ":hf:i" arg; do
+# Read commandline arguments. Try -h to get usage.
+usage() { echo "$0 usage:" && grep " .)\ #" "$0"; exit 0; }
+while getopts ":hf:in" arg; do
   case $arg in
     f) # Specify a filename.
       file=${OPTARG}
@@ -16,6 +20,9 @@
     i) # Specify interactive mode.
       mode="i"
       ;;
+    n) # Don't run stats.
+      stats="s"
+      ;;
     h | *) # Display help.
       usage
       exit 0
@@ -24,7 +31,6 @@
 done
 
 
-TODAY=`date "+%Y-%m-%d"`
 export PATH="/usr/local/bin:$PATH"
 
 [[ $mode == "i" ]] && read -p "Press [Enter] key to begin the script in interactive mode."
@@ -37,72 +43,154 @@
 echo "$(date +'%Y-%m-%dT%H:%M:%S') - Old CSV weights $old_csv bytes for $old_csv_lines lines"
 
 
 # Choose CSV depending on the command line -f argument.
-if [[ ${file} == "" ]]; then
-  if [[ `date -r en.openfoodfacts.org.products.csv "+%Y-%m-%d"` == ${TODAY} ]]; then
+if [[ "${file}" == "" ]]; then
+  # Don't download the file again if it has already been done today.
+  if [[ $(date -r en.openfoodfacts.org.products.csv "+%Y-%m-%d") == "${TODAY}" ]]; then
     echo "$(date +'%Y-%m-%dT%H:%M:%S') - CSV file has already been downloaded today. Copying it..."
     cp en.openfoodfacts.org.products.csv newdata.csv
-  else
-    [[ $mode == "i" ]] && read -p "Press [Enter] key to download CSV..."
+  else # Else download the file
+    [[ "$mode" == "i" ]] && read -p "Press [Enter] key to download CSV..."
     echo "$(date +'%Y-%m-%dT%H:%M:%S') - Downloading CSV..."
-    wget -c https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv -O newdata.csv
-    [[ $? -ne 0 ]] && { echo "Download failed, error $?"; exit 1; }
-    printf "ls\n$(ls -la newdata.csv)\n\n"
+
+    # Get the date of the file to be downloaded. Retry up to 60 times,
+    # 10 minutes apart, until its Last-Modified date is today's date.
+    export_date=$(curl -sIL "${url}" | grep -ioP '.*Last-Modified: \K(.*)$' | tr -d '\r')
+    # We don't use the .gz file because it is generated dozens of minutes later.
+    counter=1
+    # Trying to download the file. Attention: sometimes the curl command does not return anything.
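+    # For illustration, the header line parsed above looks like this
+    # (assumed example value, not captured from the real server):
+    #   Last-Modified: Wed, 04 Dec 2024 09:15:12 GMT
+    # grep -ioP keeps everything after "Last-Modified: ", tr strips the
+    # trailing \r, and date -d reduces it to YYYY-MM-DD (2024-12-04)
+    # for the string comparison against ${TODAY} in the loop below.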
+    while [[ ${export_date} == "" || "$(date -d "${export_date}" +%Y-%m-%d)" < "${TODAY}" ]]; do
+      # Exit after 60 tries.
+      [[ "${counter}" -gt 60 ]] && { echo "$(date +'%Y-%m-%dT%H:%M:%S') - Retried ${counter} times, but en.openfoodfacts.org.products.csv is from ${export_date} and not ${TODAY}. Exiting..." | tee >(cat >&2); exit 1; }
+      ((counter++))
+      echo "$(date +'%Y-%m-%dT%H:%M:%S') - CSV export is from ${export_date} and not today (${TODAY}). Retry #${counter} in 10 minutes..."
+      sleep 10m
+      export_date=$(curl -sIL "${url}" | grep -ioP '.*Last-Modified: \K(.*)$' | tr -d '\r')
+    done
+
+    echo "$(date +'%Y-%m-%dT%H:%M:%S') - Remote CSV is from ${export_date}. Downloading CSV..."
+    # wget options considered (kept here as comments; inline "\ #comment"
+    # continuations are not valid shell, so they must not appear in the command):
+    #   -O newdata.csv                    name of the output file
+    #   --quiet                           turn off wget's output
+    #   --server-response                 print the headers sent by HTTP servers and responses sent by FTP servers
+    #   --continue                        compression does not work with --continue or --start-pos, they will be disabled
+    #   --header="accept-encoding: gzip"  alternative to --compression=gzip
+    wget -O newdata.csv \
+      --quiet \
+      --compression=gzip \
+      https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv
+    echo -e "ls -la newdata.csv\n$(ls -la newdata.csv)\n"
+    # Compare local and remote size to be sure download is ok.
+    #source_size=$(wget --spider "${url}" 2>&1 | awk '/Length/ {print $2}')
+    #source_size=$(curl -sIL "${url}" | grep -ioP '.*content-length: \K(.*)$' | tr -d '\r')
+    #downloaded_size=$(wc -c < "newdata.csv")
+    #[[ ${source_size} -ne ${downloaded_size} ]] && { echo "Source size (${source_size}) is different from downloaded file size (${downloaded_size})..."; exit 1; } \
+    #  || echo "Source size (${source_size}) is equal to downloaded file size (${downloaded_size})..."
   fi
 else
   [[ $mode == "i" ]] && read -p "Press [Enter] key to use ${file} as source of the new DB."
   # Use -f filename as source file. Exit if the file can't be copied.
   echo "$(date +'%Y-%m-%dT%H:%M:%S') - Use -f filename (${file}) as source file."
-  cp ${file} newdata.csv || { echo "$(date +'%Y-%m-%dT%H:%M:%S') - cp error"; exit 1; }
+  cp "${file}" newdata.csv || { echo "$(date +'%Y-%m-%dT%H:%M:%S') - cp error"; exit 1; }
 fi
+
+# TODO: verify that the number of fields is not different from the current database.
+# nb_of_fields=$(head -n 1 newdata.csv | tr "\t" "\n" | wc -l)
+
 # Display information on source CSV
 [[ $mode == "i" ]] && read -p "Press [Enter] key to compare CSV from today and yesterday..."
 new_csv=$(wc -c newdata.csv | awk '{print $1}')
 new_csv_lines=$(wc -l newdata.csv | awk '{print $1}')
 echo "$(date +'%Y-%m-%dT%H:%M:%S') - New CSV weights $new_csv bytes for $new_csv_lines lines"
+echo "$(date +'%Y-%m-%dT%H:%M:%S') - Old CSV had $old_csv_lines lines"
 
 # Create a temporary DB if today's CSV is bigger than yesterday
-if [[ "$new_csv" -ge "$old_csv" ]]; then
-  [[ $mode == "i" ]] && read -p "Press [Enter] key to create new db..."
+# TODO: better test of the new CSV (find the newest product?)
+if (( new_csv_lines > old_csv_lines - 5000 )); then
+  [[ "$mode" == "i" ]] && read -p "Press [Enter] key to create new db..."
   mv -f en.openfoodfacts.org.products.csv en.openfoodfacts.org.products.csv.bak
   mv newdata.csv en.openfoodfacts.org.products.csv
   echo "$(date +'%Y-%m-%dT%H:%M:%S') - Creating new DB..."
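+  # A minimal sketch of the import step (assumed commands, for illustration
+  # only; the actual invocation and options may differ):
+  #   sqlite3 products_new.db < products.schema.2024-01-18
+  #   sqlite3 products_new.db ".mode tabs" ".import --skip 1 en.openfoodfacts.org.products.csv all"
+  #   ./empty2null.sh products_new.db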
   # Create DB and import CSV data
-  time sqlite3 products_new.db <
 exit
@@ -111,13 +199,15 @@
 TODAY_DB=$(sqlite3 products_new.db "select count(code) from [all];")
 
 
 # Backup the old DB and replace it by the new one
-echo "$(date +'%Y-%m-%dT%H:%M:%S') - Backup the old DB and replace it by the new one"
+echo "$(date +'%Y-%m-%dT%H:%M:%S') - Backup the old DB and replace it by the new one"
 mv products.db previous.db
 mv products_new.db products.db
+printf "\n\n"
 
 # Launch script to build data quality stats.
-time $path/data-quality.sh
+[[ ${stats} != "s" ]] && echo "$(date +'%Y-%m-%dT%H:%M:%S') - Build data quality stats" || echo "$(date +'%Y-%m-%dT%H:%M:%S') - Don't build data quality stats"
+[[ ${stats} != "s" ]] && $path/data-quality.sh off-stats.db
 
 # Restart mirabelle server
@@ -135,4 +225,4 @@
 sudo $path/clear_cache.sh "products"
 
 # curl ........;
-echo "$(date +'%Y-%m-%dT%H:%M:%S') - END of script"
+echo -e "$(date +'%Y-%m-%dT%H:%M:%S') - END of script\n"
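
# Example invocations of products_daily_update.sh (sketch; flags per the
# getopts block above, the local filename is hypothetical):
#   ./products_daily_update.sh              # normal daily run
#   ./products_daily_update.sh -n           # skip the data-quality stats
#   ./products_daily_update.sh -f dump.csv  # build from a local CSV instead of downloading
#   ./products_daily_update.sh -i           # pause at each step (interactive mode)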