Simplify OFF import script

This is now set up automatically with docker compose. The script has also been moved to a management command, which is cleaner.
2026-02-18 00:17:51 +01:00 · 2023-11-25 10:57:46 +01:00
parent 21697d61d3
commit 0515e03647
7 changed files with 267 additions and 236 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -75,3 +75,6 @@ venv-wger
 /wger/app_en.arb
 /coverage.lcov
 /media/
+/static/
+/extras/docker/open-food-facts/dump/*.tar.gz
+/extras/docker/open-food-facts/dump/off/*
--- a/extras/docker/open-food-facts/README.md
+++ b/extras/docker/open-food-facts/README.md
@@ -0,0 +1,52 @@
+# Import Open Food Facts products
+
+This docker compose helps import or update products from the Open Food Facts
+database into wger.
+
+Note that the OFF database dump is very large, and you will need several times
+this size available on your computer (tar.gz-file, extracted dump, mongo).
+
+## 1
+
+Download a current dump of their database
+
+```shell
+cd dump
+wget https://static.openfoodfacts.org/data/openfoodfacts-mongodbdump.tar.gz
+tar xzvf openfoodfacts-mongodbdump.tar.gz
+```
+
+## 2
+
+Import the data into mongo.
+
+Note that we are running this as a manual step since the import takes a while
+
+```shell
+docker compose up
+docker compose exec mongodb mongorestore --username off --password off-wger -d admin -c products /dump/off/products.bson
+```
+
+There is an admin interface available at <http://localhost:8081>, log in with
+these credentials:
+
+* admin
+* pass
+
+## 3
+
+Run the import script
+
+```shell
+python manage.py import-off-products
+```
+
+## 4
+
+Don't forget to delete the dump and remove the containers if you love your
+hard disk
+
+```shell
+docker compose down
+rm dump -r openfoodfacts-mongodbdump.tar.gz dump/off
+```
--- a/extras/docker/open-food-facts/docker-compose.yml
+++ b/extras/docker/open-food-facts/docker-compose.yml
@@ -0,0 +1,27 @@
+#
+# Helper services for importing the Open Food Facts dump into wger: a MongoDB
+# instance that the dump is restored into, and a mongo-express web interface
+# to browse the restored data. See the README.md in this folder for the steps.
+#
+# The MongoDB credentials below must match the ones used by the
+# `import-off-products` management command.
+
+services:
+  mongodb:
+    image: mongo
+    ports:
+      - "27017:27017"
+    volumes:
+      - $PWD/dump:/dump
+    environment:
+      MONGO_INITDB_ROOT_USERNAME: off
+      MONGO_INITDB_ROOT_PASSWORD: off-wger
+
+  mongo-express:
+    image: mongo-express
+    restart: always
+    ports:
+      - 8081:8081
+    environment:
+      ME_CONFIG_MONGODB_ADMINUSERNAME: off
+      ME_CONFIG_MONGODB_ADMINPASSWORD: off-wger
+      ME_CONFIG_MONGODB_URL: mongodb://off:off-wger@mongodb:27017/
--- a/extras/docker/open-food-facts/dump/.empty
+++ b/extras/docker/open-food-facts/dump/.empty
--- a/extras/scripts/create_ingredients_from_foodfacts.py
+++ b/extras/scripts/create_ingredients_from_foodfacts.py
@@ -1,234 +0,0 @@
-# This file is part of wger Workout Manager.
-#
-# wger Workout Manager is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# wger Workout Manager is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-
-from collections import Counter
-import enum
-
-from pymongo import MongoClient
-import os
-import django
-import sys
-
-sys.path.insert(0, os.path.join('..', '..'))
-os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings")
-django.setup()
-from django.conf import settings  # noqa: E402
-
-from wger.nutrition.models import Ingredient  # noqa: E402
-from wger.nutrition.off import extract_info_from_off
-from wger.core.models import Language  # noqa: E402
-
-"""
-Simple script that imports and loads the Open Food Facts database into the
-ingredients database.
-
-NOTE: The file is VERY large (40 GB), so it takes a long time (> 3 hours) to
-import the data and create all the ingredients.
-
-
-* Requirements:
- (note that the local mongo version needs to be compatible with the one used to
- create the dump, otherwise the indices won't be compatible, it is best to use
- a newer version than the one found in the ubuntu/debian repos)
-
- - MongoDB
- https://www.mongodb.com/docs/manual/tutorial/install-mongodb-on-ubuntu/
-
- - Docker
- snap install docker
-
-pip3 install pymongo
-apt-get install mongo-tools zip
-
-* Steps:
-wget https://static.openfoodfacts.org/data/openfoodfacts-mongodbdump.tar.gz
-tar xzvf openfoodfacts-mongodbdump.tar.gz
-
-# Import
-docker pull mongo
-docker run -it --name wger_mongo -p 27017:27017 -d  \
-    -e MONGO_INITDB_ROOT_USERNAME=off \
-    -e MONGO_INITDB_ROOT_PASSWORD=off-wger \
-    mongo:latest
-mongorestore --username off --password off-wger -d admin -c products dump/off/products.bson
-
-# Process
-python extras/scripts/create_ingredients_from_foodfacts.py
-
-# Cleanup
-docker stop wger_mongo
-docker rm wger_mongo
-rm openfoodfacts-mongodbdump.tar.gz
-rm -r dump
-
-# Update ingredient fixture
-python3 manage.py dumpdata nutrition.ingredient > extras/scripts/data.json
-cd extras/scripts/
-python3 filter-fixtures.py
-zip ingredients.json.zip ingredients.json
-"""
-
-client = MongoClient('mongodb://off:off-wger@127.0.0.1', port=27017)
-db = client.admin
-
-
-# Mode for this script. When using 'insert', the script will bulk-insert the new
-# ingredients, which is very efficient. Importing the whole database will require
-# barely a minute. When using 'update', existing ingredients will be updated, which
-# requires two queries per product.
-class Mode(enum.Enum):
-    INSERT = enum.auto()
-    UPDATE = enum.auto()
-
-
-MODE = Mode.UPDATE
-BULK_SIZE = 500
-
-# The completeness is a value between 0 and 1.1 and shows how much product information
-# is in the open food facts DB
-COMPLETENESS = 0.7
-
-
-# Get some completeness statistics
-# for lang in ['de', 'es', 'en']:
-#     count = db.products.count_documents({'lang': lang, 'completeness': {"$gt": COMPLETENESS}})
-#    # count = db.products.count_documents({'lang': lang, 'complete': 1})
-#     total = db.products.count_documents({'lang': lang})
-#     print(f'Lang {lang} has {count} completed products out of {total}')
-# import sys
-# sys.exit()
-
-# Completed ingredients as of 2023-04-08
-#
-# Lang az has 0 completed products out of 40
-# Lang id has 6 completed products out of 981
-# Lang cs has 80 completed products out of 4654
-# Lang de has 1025 completed products out of 165771
-# Lang en has 1028 completed products out of 929003
-# Lang es has 618 completed products out of 303937
-# Lang eo has 0 completed products out of 9
-# Lang fr has 7125 completed products out of 1160796
-# Lang hr has 57 completed products out of 1699
-# Lang it has 870 completed products out of 216046
-# Lang nl has 71 completed products out of 11744
-# Lang no has 4 completed products out of 261
-# Lang pl has 48 completed products out of 7145
-# Lang pt has 127 completed products out of 9802
-# Lang sv has 308 completed products out of 4622
-# Lang tr has 3 completed products out of 1296
-# Lang el has 6 completed products out of 927
-# Lang bg has 31 completed products out of 4524
-# Lang ru has 23 completed products out of 11683
-# Lang uk has 2 completed products out of 604
-# Lang he has 0 completed products out of 365
-# Lang ar has 1 completed products out of 3552
-# Lang fa has 0 completed products out of 575
-# Lang zh has 2 completed products out of 935
-
-# 2023-11-22
-# completeness > 0.7
-# Lang de has 47605 completed products out of 195056
-# Lang es has 23187 completed products out of 311365
-# Lang en has 39555 completed products out of 979534
-
-def main():
-    languages = {l.short_name: l for l in Language.objects.all()}
-
-    bulk_update_bucket = []
-    counter = Counter()
-
-    print('***********************************')
-    print(languages.keys())
-    print('***********************************')
-
-    for product in db.products.find({
-        'lang': {"$in": list(languages.keys())},
-        'completeness': {"$gt": COMPLETENESS}
-    }):
-
-        try:
-            ingredient_data = extract_info_from_off(product, languages[product['lang']])
-        except KeyError as e:
-            print('--> KeyError while extracting info from OFF', e)
-            counter['skipped'] += 1
-            continue
-
-        # Some products have no name or name is too long, skipping
-        if not ingredient_data['name']:
-            counter['skipped'] += 1
-            continue
-
-        if not ingredient_data['common_name']:
-            counter['skipped'] += 1
-            continue
-
-        #
-        # Add entries as new products
-        if MODE == Mode.INSERT:
-            bulk_update_bucket.append(Ingredient(**ingredient_data))
-            if len(bulk_update_bucket) > BULK_SIZE:
-                try:
-                    Ingredient.objects.bulk_create(bulk_update_bucket)
-                    print('***** Bulk adding products *****')
-                except Exception as e:
-                    print('--> Error while saving the product bucket. Saving individually')
-                    print(e)
-
-                    # Try saving the ingredients individually as most will be correct
-                    for ingredient in bulk_update_bucket:
-                        try:
-                            ingredient.save()
-
-                        # ¯\_(ツ)_/¯
-                        except Exception as e:
-                            print('--> Error while saving the product individually')
-                            print(e)
-
-                counter['new'] += BULK_SIZE
-                bulk_update_bucket = []
-
-        # Update existing entries
-        else:
-            try:
-
-                # Update an existing product (look-up key is the code) or create a new
-                # one. While this might not be the most efficient query (there will always
-                # be a SELECT first), it's ok because this script is run very rarely.
-                obj, created = Ingredient.objects.update_or_create(
-                    code=ingredient_data['code'],
-                    defaults=ingredient_data
-                )
-
-                if created:
-                    counter['new'] += 1
-                    # print('-> added to the database')
-                else:
-                    counter['edited'] += 1
-                    # print('-> updated')
-
-            except Exception as e:
-                print('--> Error while performing update_or_create')
-                print(f'  ingredient: {ingredient_data["name"]}')
-                print(e)
-                print(ingredient_data)
-                counter['error'] += 1
-                continue
-
-    print('***********************************')
-    print(counter)
-    print('***********************************')
-
-
-if __name__ == "__main__":
-    main()
--- a/wger/core/models/profile.py
+++ b/wger/core/models/profile.py
@@ -523,8 +523,8 @@ by the US Department of Agriculture. It is extremely complete, with around
        """
        if (
            not WeightEntry.objects.filter(user=self.user).exists() or (
-                datetime.date.today() - WeightEntry.objects.filter(user=self.user).latest().date
-                > datetime.timedelta(days=3)
+                datetime.date.today() - WeightEntry.objects.filter(user=self.user).latest().date >
+                datetime.timedelta(days=3)
            )
        ):
            entry = WeightEntry()
--- a/wger/nutrition/management/commands/import-off-products.py
+++ b/wger/nutrition/management/commands/import-off-products.py
@@ -0,0 +1,183 @@
+# This file is part of wger Workout Manager.
+#
+# wger Workout Manager is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# wger Workout Manager is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+
+# Standard Library
+import enum
+import logging
+from collections import Counter
+
+# Django
+from django.core.management.base import BaseCommand
+
+# wger
+from wger.core.models import Language
+from wger.nutrition.models import Ingredient
+from wger.nutrition.off import extract_info_from_off
+
+
+logger = logging.getLogger(__name__)
+
+
+# Mode for this script. When using 'insert', the script will bulk-insert the new
+# ingredients, which is very efficient. Importing the whole database will require
+# barely a minute. When using 'update', existing ingredients will be updated, which
+# requires two queries per product.
+class Mode(enum.Enum):
+    """Import strategy: bulk-insert new ingredients or update existing ones."""
+
+    INSERT = 1
+    UPDATE = 2
+
+
+class Command(BaseCommand):
+    """
+    Import products from an Open Food Facts dump into the ingredient table.
+
+    The products are read from a MongoDB instance into which the dump was
+    previously restored, see extras/docker/open-food-facts for the setup.
+    """
+
+    # Default import mode, overridden by --set-mode
+    mode = Mode.UPDATE
+
+    # Number of ingredients written per bulk_create call in insert mode
+    bulk_size = 500
+
+    # Default completeness threshold, overridden by --completeness
+    completeness = 0.7
+
+    help = 'Import an Open Food Facts dump. Please consult extras/docker/open-food-facts'
+
+    def add_arguments(self, parser):
+        """Register the --set-mode and --completeness command line options."""
+        parser.add_argument(
+            '--set-mode',
+            action='store',
+            default='update',
+            dest='mode',
+            type=str,
+            choices=['insert', 'update'],
+            help='Script mode, "insert" or "update". Insert will insert the ingredients as new '
+            'entries in the database, while update will try to update them if they are '
+            'already present. Default: update'
+        )
+        parser.add_argument(
+            '--completeness',
+            action='store',
+            default=0.7,
+            dest='completeness',
+            type=float,
+            help='Completeness threshold for importing the products. Products in OFF have '
+            'completeness score that ranges from 0 to 1.1'
+        )
+
+    def handle(self, **options):
+        """
+        Run the import.
+
+        Validates the options, connects to the MongoDB instance on localhost
+        and creates or updates one ingredient per matching product. A summary
+        with the number of processed products is printed at the end.
+        """
+        try:
+            # Third Party
+            from pymongo import MongoClient
+        except ImportError:
+            self.stdout.write('Please install pymongo, `pip install pymongo`')
+            return
+
+        self.mode = Mode.INSERT if options['mode'] == 'insert' else Mode.UPDATE
+
+        if not 0 <= options['completeness'] <= 1.1:
+            self.stdout.write('Completeness must be between 0 and 1.1')
+            return
+        self.completeness = options['completeness']
+
+        self.stdout.write(self.style.SUCCESS('Importing entries from Open Food Facts'))
+        self.stdout.write(self.style.SUCCESS(f' - Completeness threshold: {self.completeness}'))
+        self.stdout.write(self.style.SUCCESS(f' - Mode: {self.mode}'))
+        self.stdout.write(self.style.SUCCESS(''))
+
+        client = MongoClient('mongodb://off:off-wger@127.0.0.1', port=27017)
+        try:
+            counter = self._import_products(client.admin.products)
+        finally:
+            # Release the connection even if the import is interrupted
+            client.close()
+
+        self.stdout.write(self.style.SUCCESS('**************************************'))
+        self.stdout.write(self.style.SUCCESS('Finished!'))
+        self.stdout.write(self.style.SUCCESS(str(counter)))
+        self.stdout.write(self.style.SUCCESS('**************************************'))
+
+    def _import_products(self, collection):
+        """
+        Create or update an ingredient for every matching product in the collection.
+
+        Only products in one of the languages known to wger and with a
+        completeness above the configured threshold are considered. Returns a
+        Counter with the number of new, edited, skipped and failed products.
+        """
+        languages = {language.short_name: language for language in Language.objects.all()}
+        counter = Counter()
+        bucket = []
+
+        query = {
+            'lang': {
+                "$in": list(languages.keys())
+            },
+            'completeness': {
+                "$gt": self.completeness
+            },
+        }
+        for product in collection.find(query):
+            try:
+                ingredient_data = extract_info_from_off(product, languages[product['lang']])
+            except KeyError as e:
+                self.stdout.write(f'--> KeyError while extracting info from OFF: {e}')
+                counter['skipped'] += 1
+                continue
+
+            # Some products have no name or the name is too long, skip them
+            if not ingredient_data['name'] or not ingredient_data['common_name']:
+                counter['skipped'] += 1
+                continue
+
+            if self.mode == Mode.INSERT:
+                bucket.append(Ingredient(**ingredient_data))
+                if len(bucket) >= self.bulk_size:
+                    self._save_bucket(bucket, counter)
+                    bucket = []
+            else:
+                self._update_or_create(ingredient_data, counter)
+
+        # Save the products left over from the last, only partially filled bucket
+        if bucket:
+            self._save_bucket(bucket, counter)
+
+        return counter
+
+    def _save_bucket(self, bucket, counter):
+        """Bulk-create the ingredients in the bucket, saving them one by one on failure."""
+        try:
+            Ingredient.objects.bulk_create(bucket)
+        except Exception as e:
+            self.stdout.write('--> Error while saving the product bucket. Saving individually')
+            self.stdout.write(str(e))
+        else:
+            self.stdout.write('***** Bulk adding products *****')
+            counter['new'] += len(bucket)
+            return
+
+        # Try saving the ingredients individually as most will be correct
+        for ingredient in bucket:
+            try:
+                ingredient.save()
+            except Exception as e:
+                self.stdout.write('--> Error while saving the product individually')
+                self.stdout.write(str(e))
+                counter['error'] += 1
+            else:
+                counter['new'] += 1
+
+    def _update_or_create(self, ingredient_data, counter):
+        """Update the ingredient with the same code, or create it if there is none yet."""
+        try:
+            # The look-up key is the product code. While this might not be the most
+            # efficient query (there will always be a SELECT first), it's ok because
+            # this script is run very rarely.
+            _, created = Ingredient.objects.update_or_create(
+                code=ingredient_data['code'], defaults=ingredient_data
+            )
+        except Exception as e:
+            self.stdout.write('--> Error while performing update_or_create')
+            self.stdout.write(str(e))
+            counter['error'] += 1
+        else:
+            counter['new' if created else 'edited'] += 1