Files
wger/extras/scripts/create_ingredients_from_foodfacts.py
2021-10-06 09:56:32 +02:00

222 lines
7.3 KiB
Python

# This file is part of wger Workout Manager.
#
# wger Workout Manager is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# wger Workout Manager is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with wger Workout Manager. If not, see <https://www.gnu.org/licenses/>.
from pymongo import MongoClient
import os
import django
import sys
sys.path.insert(0, os.path.join('..', '..'))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings")
django.setup()
from django.conf import settings # noqa: E402
from wger.nutrition.models import Ingredient # noqa: E402
from wger.nutrition.models.ingredient import Source # noqa: E402
from wger.core.models import Language # noqa: E402
"""
Simple script that imports and loads the Open Food Facts database into the
ingredients database.
NOTE: The file is VERY large (17 GB), so it takes a long time (> 3 hours) to run
and create all ingredients.
* Requirements:
pip3 install pymongo zip
apt-get install mongo-tools # (for mongorestore)
* Steps:
wget https://static.openfoodfacts.org/data/openfoodfacts-mongodbdump.tar.gz
tar xzvf openfoodfacts-mongodbdump.tar.gz
# Import
docker pull mongo
docker run -it --name wger_mongo -p 27017:27017 -d \
-e MONGO_INITDB_ROOT_USERNAME=off \
-e MONGO_INITDB_ROOT_PASSWORD=off-wger \
mongo:latest
mongorestore --username off --password off-wger -d admin -c products dump/off/products.bson
# Process
python extras/scripts/create_ingredients_from_foodfacts.py
# Cleanup
docker stop wger_mongo
docker rm wger_mongo
rm openfoodfacts-mongodbdump.tar.gz
rm -r dump
# Update ingredient fixture
python3 manage.py dumpdata nutrition.ingredient > extras/scripts/data.json
cd extras/scripts/
python3 filter-fixtures.py
zip ingredients.json.zip ingredients.json
"""
# Connection to the local MongoDB instance that holds the restored Open Food
# Facts dump. The products collection is read from the "admin" database.
client = MongoClient('mongodb://off:off-wger@127.0.0.1', port=27017)
db = client['admin']

# Operating mode of the script:
# * 'insert': new ingredients are collected and written with bulk inserts,
#             which is very efficient.
# * anything else ('update'): every product is looked up by its code and
#             updated or created individually, which costs two queries per
#             product.
MODE = 'insert'

# Language objects, keyed by the short names configured in the Django settings
languages = {
    short_name: Language.objects.get(short_name=short_name)
    for short_name in (entry[0] for entry in settings.LANGUAGES)
}

# Bucket size threshold for the bulk inserts and the bucket itself
BULK_SIZE = 500
bulk_update_bucket = []

# Counters reported once the script has processed all products
stats = dict.fromkeys(('new', 'edited', 'skipped'), 0)
# for lang in languages.keys():
# count = db.products.count_documents({'lang': lang, 'complete': 1})
# total = db.products.count_documents({'lang': lang})
# print(f'Lang {lang} has {count} completed products out of {total}')
# Lang de has 15667 completed products out of 52669
# Lang es has 8757 completed products out of 186171
# Lang ru has 505 completed products out of 2918
# Lang fr has 85058 completed products out of 811715
# Lang bg has 15 completed products out of 718
# Lang el has 20 completed products out of 251
# Lang nl has 750 completed products out of 5611
# Lang no has 9 completed products out of 141
# Lang cs has 123 completed products out of 789
# Lang sv has 1340 completed products out of 2878
# Lang pt has 526 completed products out of 3541
# Nutritional values (per 100 g) a product must provide in order to be imported
REQUIRED_NUTRIMENTS = (
    'energy-kcal_100g',
    'proteins_100g',
    'carbohydrates_100g',
    'sugars_100g',
    'fat_100g',
    'saturated-fat_100g',
)

# Longest product name / generic name this script accepts
MAX_NAME_LENGTH = 200


def _build_ingredient_data(product):
    """
    Translate an Open Food Facts product document into Ingredient field values.

    :param product: product document as returned by the MongoDB query
    :return: dictionary with the keyword arguments for ``Ingredient``, or
             ``None`` when the product must be skipped (missing name or code,
             empty or too long name, missing required nutriments or a too
             long generic name)
    """
    # Both keys are mandatory. Skipping here avoids silently re-using the
    # name and code of the previously processed product.
    if 'product_name' not in product or 'code' not in product:
        return None

    name = product['product_name']
    code = product['code']

    # Some products have no name or the name is too long
    if not name or len(name) > MAX_NAME_LENGTH:
        return None

    # A missing or empty "nutriments" entry is treated like an empty mapping
    nutriments = product.get('nutriments') or {}
    if not all(req in nutriments for req in REQUIRED_NUTRIMENTS):
        return None

    common_name = product.get('generic_name', None)
    if common_name and len(common_name) > MAX_NAME_LENGTH:
        return None

    source_name = Source.OPEN_FOOD_FACTS.value
    return {
        'language': languages[product['lang']],
        'name': name,
        'energy': nutriments['energy-kcal_100g'],
        'protein': nutriments['proteins_100g'],
        'carbohydrates': nutriments['carbohydrates_100g'],
        'carbohydrates_sugar': nutriments['sugars_100g'],
        'fat': nutriments['fat_100g'],
        'fat_saturated': nutriments['saturated-fat_100g'],
        # Fibre and sodium are optional
        'fibres': nutriments.get('fiber_100g', None),
        'sodium': nutriments.get('sodium_100g', None),
        'code': code,
        'source_name': source_name,
        'source_url': f'https://world.openfoodfacts.org/api/v0/product/{code}.json',
        'common_name': common_name,
        'brand': product.get('brands', None),
        # NOTE(review): the status and license ids are kept unchanged from the
        # previous version of this script; their meaning is defined by the
        # wger models/fixtures - confirm they are still correct.
        'status': 2,
        'license_id': 5,
        'license_author': source_name,
    }


def _flush_bucket(bucket):
    """
    Write the collected ingredients to the database and update the statistics.

    A bulk insert is tried first. If it fails, the ingredients are saved one
    by one, as most of them will be correct. Only ingredients that were
    really saved are counted as new, the others are counted as skipped.

    :param bucket: list of unsaved ``Ingredient`` instances
    """
    if not bucket:
        return

    try:
        Ingredient.objects.bulk_create(bucket)
    except Exception as e:
        print('--> Error while saving the product bucket. Saving individually')
        print(e)
    else:
        print('***** Bulk adding products *****')
        stats['new'] += len(bucket)
        return

    for ingredient in bucket:
        try:
            ingredient.save()
        except Exception as e:
            # Best effort: one broken product must not abort the whole import
            print(f'--> Could not save product "{ingredient.name}": {e}')
            stats['skipped'] += 1
        else:
            stats['new'] += 1


print('***********************************')
print(languages.keys())
print('***********************************')

# Only completed products in one of the configured languages are imported
for product in db.products.find({'lang': {"$in": list(languages.keys())}, 'complete': 1}):
    ingredient_data = _build_ingredient_data(product)
    if ingredient_data is None:
        stats['skipped'] += 1
        continue

    # Add entries as new products
    if MODE == 'insert':
        bulk_update_bucket.append(Ingredient(**ingredient_data))
        if len(bulk_update_bucket) >= BULK_SIZE:
            _flush_bucket(bulk_update_bucket)
            bulk_update_bucket = []

    # Update existing entries
    else:
        code = ingredient_data['code']
        try:
            # Update an existing product (look-up key is the code) or create a new
            # one. While this might not be the most efficient query (there will always
            # be a SELECT first), it's ok because this script is run very rarely.
            _, created = Ingredient.objects.update_or_create(
                code=code,
                defaults=ingredient_data,
            )
        except Exception as e:
            # Keep going with the next product, but do not hide the problem and
            # do not swallow KeyboardInterrupt / SystemExit
            print(f'--> Error while updating product {code}: {e}')
            stats['skipped'] += 1
            continue

        if created:
            stats['new'] += 1
        else:
            stats['edited'] += 1

# Write the products still waiting in the bucket, otherwise the last (partial)
# batch would never reach the database
if bulk_update_bucket:
    _flush_bucket(bulk_update_bucket)
    bulk_update_bucket = []

print('***********************************')
print(stats)
print('***********************************')