From 0515e03647307459bc48706dbc4dec3dcd7c1692 Mon Sep 17 00:00:00 2001 From: Roland Geider Date: Sat, 25 Nov 2023 10:57:46 +0100 Subject: [PATCH] Simplify OFF import script This now is automatically setup with docker compose, also the script has been moved to a management command, which is cleaner --- .gitignore | 3 + extras/docker/open-food-facts/README.md | 52 ++++ .../docker/open-food-facts/docker-compose.yml | 27 ++ extras/docker/open-food-facts/dump/.empty | 0 .../create_ingredients_from_foodfacts.py | 234 ------------------ wger/core/models/profile.py | 4 +- .../commands/import-off-products.py | 183 ++++++++++++++ 7 files changed, 267 insertions(+), 236 deletions(-) create mode 100644 extras/docker/open-food-facts/README.md create mode 100644 extras/docker/open-food-facts/docker-compose.yml create mode 100644 extras/docker/open-food-facts/dump/.empty delete mode 100644 extras/scripts/create_ingredients_from_foodfacts.py create mode 100644 wger/nutrition/management/commands/import-off-products.py diff --git a/.gitignore b/.gitignore index ec0326d8e..3f2ebf49d 100644 --- a/.gitignore +++ b/.gitignore @@ -75,3 +75,6 @@ venv-wger /wger/app_en.arb /coverage.lcov /media/ +/static/ +/extras/docker/open-food-facts/dump/*.tar.gz +/extras/docker/open-food-facts/dump/off/* diff --git a/extras/docker/open-food-facts/README.md b/extras/docker/open-food-facts/README.md new file mode 100644 index 000000000..7d626f3b8 --- /dev/null +++ b/extras/docker/open-food-facts/README.md @@ -0,0 +1,52 @@ +# Import Open Food Facts products + +This docker compose helps import or update products from the Open Food Facts +database into wger. + +Note that the OFF database dump is very large, and you will need several times +this size available on your computer (tar.gz-file, extracted dump, mongo). 
+ +## 1 + +Download a current dump of their database + +```shell +cd dump +wget https://static.openfoodfacts.org/data/openfoodfacts-mongodbdump.tar.gz +tar xzvf openfoodfacts-mongodbdump.tar.gz +``` + +## 2 + +Import the data into mongo. + +Note that we are running this as a manual step since the import takes a while + +```shell +docker compose up +docker compose exec mongodb mongorestore --username off --password off-wger -d admin -c products /dump/off/products.bson +``` + +There is an admin interface available at <http://localhost:8081>, log in with +these credentials: + +* admin +* pass + +## 3 + +Run the import script + +```shell +python manage.py import-off-products +``` + +## 4 + +Don't forget to delete the dump and remove the containers if you love your +hard disk + +```shell +docker compose down +rm -r dump/openfoodfacts-mongodbdump.tar.gz dump/off +``` diff --git a/extras/docker/open-food-facts/docker-compose.yml b/extras/docker/open-food-facts/docker-compose.yml new file mode 100644 index 000000000..a4eb26e7b --- /dev/null +++ b/extras/docker/open-food-facts/docker-compose.yml @@ -0,0 +1,27 @@ +# +# Please consult the `Deployment` section in the readme if you want to deploy +# this. You *need* to keep this nginx service, even if you have your own, +# otherwise the static files will not be served correctly! 
If you do remove +# it, configure yours similarly to what's in config/nginx.conf +# Also take a look at the "Static files" section in the .env file + +services: + mongodb: + image: mongo + ports: + - "27017:27017" + volumes: + - $PWD/dump:/dump + environment: + MONGO_INITDB_ROOT_USERNAME: off + MONGO_INITDB_ROOT_PASSWORD: off-wger + + mongo-express: + image: mongo-express + restart: always + ports: + - 8081:8081 + environment: + ME_CONFIG_MONGODB_ADMINUSERNAME: off + ME_CONFIG_MONGODB_ADMINPASSWORD: off-wger + ME_CONFIG_MONGODB_URL: mongodb://off:off-wger@mongodb:27017/ diff --git a/extras/docker/open-food-facts/dump/.empty b/extras/docker/open-food-facts/dump/.empty new file mode 100644 index 000000000..e69de29bb diff --git a/extras/scripts/create_ingredients_from_foodfacts.py b/extras/scripts/create_ingredients_from_foodfacts.py deleted file mode 100644 index a71a32c54..000000000 --- a/extras/scripts/create_ingredients_from_foodfacts.py +++ /dev/null @@ -1,234 +0,0 @@ -# This file is part of wger Workout Manager. -# -# wger Workout Manager is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# wger Workout Manager is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
-# -# You should have received a copy of the GNU Affero General Public License - -from collections import Counter -import enum - -from pymongo import MongoClient -import os -import django -import sys - -sys.path.insert(0, os.path.join('..', '..')) -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings") -django.setup() -from django.conf import settings # noqa: E402 - -from wger.nutrition.models import Ingredient # noqa: E402 -from wger.nutrition.off import extract_info_from_off -from wger.core.models import Language # noqa: E402 - -""" -Simple script that imports and loads the Open Food Facts database into the -ingredients database. - -NOTE: The file is VERY large (40 GB), so it takes a long time (> 3 hours) to -import the data and create all the ingredients. - - -* Requirements: - (note that the local mongo version needs to be compatible with the one used to - create the dump, otherwise the indices won't be compatible, it is best to use - a newer version than the one found in the ubuntu/debian repos) - - - MongoDB - https://www.mongodb.com/docs/manual/tutorial/install-mongodb-on-ubuntu/ - - - Docker - snap install docker - -pip3 install pymongo -apt-get install mongo-tools zip - -* Steps: -wget https://static.openfoodfacts.org/data/openfoodfacts-mongodbdump.tar.gz -tar xzvf openfoodfacts-mongodbdump.tar.gz - -# Import -docker pull mongo -docker run -it --name wger_mongo -p 27017:27017 -d \ - -e MONGO_INITDB_ROOT_USERNAME=off \ - -e MONGO_INITDB_ROOT_PASSWORD=off-wger \ - mongo:latest -mongorestore --username off --password off-wger -d admin -c products dump/off/products.bson - -# Process -python extras/scripts/create_ingredients_from_foodfacts.py - -# Cleanup -docker stop wger_mongo -docker rm wger_mongo -rm openfoodfacts-mongodbdump.tar.gz -rm -r dump - -# Update ingredient fixture -python3 manage.py dumpdata nutrition.ingredient > extras/scripts/data.json -cd extras/scripts/ -python3 filter-fixtures.py -zip ingredients.json.zip ingredients.json -""" - 
-client = MongoClient('mongodb://off:off-wger@127.0.0.1', port=27017) -db = client.admin - - -# Mode for this script. When using 'insert', the script will bulk-insert the new -# ingredients, which is very efficient. Importing the whole database will require -# barely a minute. When using 'update', existing ingredients will be updated, which -# requires two queries per product. -class Mode(enum.Enum): - INSERT = enum.auto() - UPDATE = enum.auto() - - -MODE = Mode.UPDATE -BULK_SIZE = 500 - -# The completeness is a value between 0 and 1.1 and shows how much product information -# is in the open food facts DB -COMPLETENESS = 0.7 - - -# Get some completeness statistics -# for lang in ['de', 'es', 'en']: -# count = db.products.count_documents({'lang': lang, 'completeness': {"$gt": COMPLETENESS}}) -# # count = db.products.count_documents({'lang': lang, 'complete': 1}) -# total = db.products.count_documents({'lang': lang}) -# print(f'Lang {lang} has {count} completed products out of {total}') -# import sys -# sys.exit() - -# Completed ingredients as of 2023-04-08 -# -# Lang az has 0 completed products out of 40 -# Lang id has 6 completed products out of 981 -# Lang cs has 80 completed products out of 4654 -# Lang de has 1025 completed products out of 165771 -# Lang en has 1028 completed products out of 929003 -# Lang es has 618 completed products out of 303937 -# Lang eo has 0 completed products out of 9 -# Lang fr has 7125 completed products out of 1160796 -# Lang hr has 57 completed products out of 1699 -# Lang it has 870 completed products out of 216046 -# Lang nl has 71 completed products out of 11744 -# Lang no has 4 completed products out of 261 -# Lang pl has 48 completed products out of 7145 -# Lang pt has 127 completed products out of 9802 -# Lang sv has 308 completed products out of 4622 -# Lang tr has 3 completed products out of 1296 -# Lang el has 6 completed products out of 927 -# Lang bg has 31 completed products out of 4524 -# Lang ru has 23 completed 
products out of 11683 -# Lang uk has 2 completed products out of 604 -# Lang he has 0 completed products out of 365 -# Lang ar has 1 completed products out of 3552 -# Lang fa has 0 completed products out of 575 -# Lang zh has 2 completed products out of 935 - -# 2023-11-22 -# completeness > 0.7 -# Lang de has 47605 completed products out of 195056 -# Lang es has 23187 completed products out of 311365 -# Lang en has 39555 completed products out of 979534 - -def main(): - languages = {l.short_name: l for l in Language.objects.all()} - - bulk_update_bucket = [] - counter = Counter() - - print('***********************************') - print(languages.keys()) - print('***********************************') - - for product in db.products.find({ - 'lang': {"$in": list(languages.keys())}, - 'completeness': {"$gt": COMPLETENESS} - }): - - try: - ingredient_data = extract_info_from_off(product, languages[product['lang']]) - except KeyError as e: - print('--> KeyError while extracting info from OFF', e) - counter['skipped'] += 1 - continue - - # Some products have no name or name is too long, skipping - if not ingredient_data['name']: - counter['skipped'] += 1 - continue - - if not ingredient_data['common_name']: - counter['skipped'] += 1 - continue - - # - # Add entries as new products - if MODE == Mode.INSERT: - bulk_update_bucket.append(Ingredient(**ingredient_data)) - if len(bulk_update_bucket) > BULK_SIZE: - try: - Ingredient.objects.bulk_create(bulk_update_bucket) - print('***** Bulk adding products *****') - except Exception as e: - print('--> Error while saving the product bucket. 
Saving individually') - print(e) - - # Try saving the ingredients individually as most will be correct - for ingredient in bulk_update_bucket: - try: - ingredient.save() - - # ¯\_(ツ)_/¯ - except Exception as e: - print('--> Error while saving the product individually') - print(e) - - counter['new'] += BULK_SIZE - bulk_update_bucket = [] - - # Update existing entries - else: - try: - - # Update an existing product (look-up key is the code) or create a new - # one. While this might not be the most efficient query (there will always - # be a SELECT first), it's ok because this script is run very rarely. - obj, created = Ingredient.objects.update_or_create( - code=ingredient_data['code'], - defaults=ingredient_data - ) - - if created: - counter['new'] += 1 - # print('-> added to the database') - else: - counter['edited'] += 1 - # print('-> updated') - - except Exception as e: - print('--> Error while performing update_or_create') - print(f' ingredient: {ingredient_data["name"]}') - print(e) - print(ingredient_data) - counter['error'] += 1 - continue - - print('***********************************') - print(counter) - print('***********************************') - - -if __name__ == "__main__": - main() diff --git a/wger/core/models/profile.py b/wger/core/models/profile.py index 8edda7b73..ba7ca646f 100644 --- a/wger/core/models/profile.py +++ b/wger/core/models/profile.py @@ -523,8 +523,8 @@ by the US Department of Agriculture. 
It is extremely complete, with around """ if ( not WeightEntry.objects.filter(user=self.user).exists() or ( - datetime.date.today() - WeightEntry.objects.filter(user=self.user).latest().date - > datetime.timedelta(days=3) + datetime.date.today() - WeightEntry.objects.filter(user=self.user).latest().date > + datetime.timedelta(days=3) ) ): entry = WeightEntry() diff --git a/wger/nutrition/management/commands/import-off-products.py b/wger/nutrition/management/commands/import-off-products.py new file mode 100644 index 000000000..4a5b60649 --- /dev/null +++ b/wger/nutrition/management/commands/import-off-products.py @@ -0,0 +1,183 @@ +# This file is part of wger Workout Manager. +# +# wger Workout Manager is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# wger Workout Manager is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License + +# Standard Library +import enum +import logging +from collections import Counter + +# Django +from django.core.management.base import BaseCommand + +# wger +from wger.core.models import Language +from wger.nutrition.models import Ingredient +from wger.nutrition.off import extract_info_from_off + + +logger = logging.getLogger(__name__) + + +# Mode for this script. When using 'insert', the script will bulk-insert the new +# ingredients, which is very efficient. Importing the whole database will require +# barely a minute. When using 'update', existing ingredients will be updated, which +# requires two queries per product. 
class Mode(enum.Enum):
    """
    Import strategy used by the command

    INSERT collects the products and writes them with ``bulk_create``, UPDATE
    sends every product through ``update_or_create`` (looked up by its code).
    """
    INSERT = enum.auto()
    UPDATE = enum.auto()


class Command(BaseCommand):
    """
    Import an Open Food Facts dump
    """
    mode = Mode.UPDATE
    bulk_size = 500
    completeness = 0.7

    help = 'Import an Open Food Facts dump. Please consult extras/docker/open-food-facts'

    def add_arguments(self, parser):
        """
        Register the ``--set-mode`` and ``--completeness`` options
        """
        parser.add_argument(
            '--set-mode',
            action='store',
            default='update',
            dest='mode',
            type=str,
            help='Script mode, "insert" or "update". Insert will insert the ingredients as new '
            'entries in the database, while update will try to update them if they are '
            'already present. Default: update'
        )
        parser.add_argument(
            '--completeness',
            action='store',
            default=0.7,
            dest='completeness',
            type=float,
            help='Completeness threshold for importing the products. Products in OFF have '
            'completeness score that ranges from 0 to 1.1'
        )

    def handle(self, **options):
        """
        Validate the options, connect to the local mongo instance and import
        all matching products.

        Invalid options are reported on stdout and abort the command without
        touching the database.
        """
        try:
            # Third Party
            from pymongo import MongoClient
        except ImportError:
            self.stdout.write('Please install pymongo, `pip install pymongo`')
            return

        # Reject unknown values instead of silently falling back to "update",
        # a typo should not start an hours-long run in the wrong mode.
        modes = {'insert': Mode.INSERT, 'update': Mode.UPDATE}
        try:
            self.mode = modes[str(options['mode']).lower()]
        except KeyError:
            self.stdout.write('Mode must be either "insert" or "update"')
            return

        if options['completeness'] < 0 or options['completeness'] > 1.1:
            self.stdout.write('Completeness must be between 0 and 1.1')
            return
        self.completeness = options['completeness']

        self.stdout.write(self.style.SUCCESS('Importing entries from Open Food Facts'))
        self.stdout.write(self.style.SUCCESS(f' - Completeness threshold: {self.completeness}'))
        self.stdout.write(self.style.SUCCESS(f' - Mode: {self.mode}'))
        self.stdout.write(self.style.SUCCESS(''))

        # Credentials and port match extras/docker/open-food-facts/docker-compose.yml
        client = MongoClient('mongodb://off:off-wger@127.0.0.1', port=27017)
        try:
            counter = self._import_products(client.admin)
        finally:
            client.close()

        self.stdout.write(self.style.SUCCESS('**************************************'))
        self.stdout.write(self.style.SUCCESS('Finished!'))
        self.stdout.write(self.style.SUCCESS(str(counter)))
        self.stdout.write(self.style.SUCCESS('**************************************'))

    def _import_products(self, db):
        """
        Iterate over the products in ``db.products`` and save them

        Only products in one of the languages known to wger and with a
        completeness above the configured threshold are queried. Returns a
        Counter with the keys 'new', 'edited', 'skipped' and 'error'.
        """
        languages = {language.short_name: language for language in Language.objects.all()}
        query = {
            'lang': {
                "$in": list(languages)
            },
            'completeness': {
                "$gt": self.completeness
            }
        }

        bucket = []
        counter = Counter()

        for product in db.products.find(query):
            try:
                ingredient_data = extract_info_from_off(product, languages[product['lang']])
            except KeyError as e:
                # The exception must be part of the message: the second
                # positional argument of stdout.write() is the style function
                self.stdout.write(f'--> KeyError while extracting info from OFF: {e}')
                counter['skipped'] += 1
                continue

            # Some products have no name or name is too long, skipping
            if not ingredient_data['name'] or not ingredient_data['common_name']:
                counter['skipped'] += 1
                continue

            # Add entries as new products
            if self.mode == Mode.INSERT:
                bucket.append(Ingredient(**ingredient_data))
                if len(bucket) >= self.bulk_size:
                    self._save_bucket(bucket, counter)
                    bucket = []

            # Update existing entries
            else:
                self._update_or_create(ingredient_data, counter)

        # Write the last, partially filled bucket, otherwise up to bulk_size
        # products would be silently dropped at the end of an insert run
        if bucket:
            self._save_bucket(bucket, counter)

        return counter

    def _save_bucket(self, bucket, counter):
        """
        Bulk-insert the ingredients in ``bucket``, falling back to saving them
        one by one if the bulk insert fails
        """
        try:
            Ingredient.objects.bulk_create(bucket)
        except Exception as e:
            self.stdout.write('--> Error while saving the product bucket. Saving individually')
            self.stdout.write(str(e))
        else:
            self.stdout.write('***** Bulk adding products *****')
            counter['new'] += len(bucket)
            return

        # Try saving the ingredients individually as most will be correct
        for ingredient in bucket:
            try:
                ingredient.save()
            except Exception as e:
                # Best effort: a single broken product must not abort the import
                self.stdout.write('--> Error while saving the product individually')
                self.stdout.write(str(e))
                counter['error'] += 1
            else:
                counter['new'] += 1

    def _update_or_create(self, ingredient_data, counter):
        """
        Update the ingredient with the same code or create a new one

        While this might not be the most efficient query (there will always
        be a SELECT first), it's ok because this script is run very rarely.
        """
        try:
            _, created = Ingredient.objects.update_or_create(
                code=ingredient_data['code'], defaults=ingredient_data
            )
        except Exception as e:
            # Best effort: report the problem and carry on with the next product
            self.stdout.write('--> Error while performing update_or_create')
            self.stdout.write(str(e))
            counter['error'] += 1
            return

        counter['new' if created else 'edited'] += 1