Simplify OFF import script

The import is now set up automatically with docker compose. The script has
also been moved to a management command, which is cleaner.
This commit is contained in:
Roland Geider
2023-11-25 10:57:46 +01:00
parent 21697d61d3
commit 0515e03647
7 changed files with 267 additions and 236 deletions

3
.gitignore vendored
View File

@@ -75,3 +75,6 @@ venv-wger
/wger/app_en.arb
/coverage.lcov
/media/
/static/
/extras/docker/open-food-facts/dump/*.tar.gz
/extras/docker/open-food-facts/dump/off/*

View File

@@ -0,0 +1,52 @@
# Import Open Food Facts products
This docker compose helps import or update products from the Open Food Facts
database into wger.
Note that the OFF database dump is very large, and you will need several times
this size available on your computer (tar.gz-file, extracted dump, mongo).
## 1
Download a current dump of their database
```shell
cd dump
wget https://static.openfoodfacts.org/data/openfoodfacts-mongodbdump.tar.gz
tar xzvf openfoodfacts-mongodbdump.tar.gz
```
## 2
Import the data into mongo.
Note that we are running this as a manual step since the import takes a while
```shell
docker compose up -d
docker compose exec mongodb mongorestore --username off --password off-wger -d admin -c products /dump/off/products.bson
```
There is an admin interface available at <http://localhost:8081>, log in with
these credentials:
* admin
* pass
## 3
Run the import script
```shell
python manage.py import-off-products
```
## 4
Don't forget to delete the dump and remove the containers if you love your
hard disk
```shell
docker compose down
rm -r dump/openfoodfacts-mongodbdump.tar.gz dump/off
```

View File

@@ -0,0 +1,27 @@
#
# Helper services for importing an Open Food Facts database dump.
#
# - mongodb:       MongoDB instance the dump is restored into. The local
#                  `dump` folder is mounted at /dump inside the container.
# - mongo-express: web interface for browsing the restored data, published
#                  on port 8081.
#
# The credentials below are only meant for this throw-away local setup.
#
services:
  mongodb:
    image: mongo
    ports:
      - "27017:27017"
    volumes:
      - $PWD/dump:/dump
    environment:
      # Must be quoted: a bare `off` is a boolean (false) for YAML 1.1 parsers
      MONGO_INITDB_ROOT_USERNAME: "off"
      MONGO_INITDB_ROOT_PASSWORD: "off-wger"

  mongo-express:
    image: mongo-express
    restart: always
    ports:
      - "8081:8081"
    environment:
      # Must be quoted: a bare `off` is a boolean (false) for YAML 1.1 parsers
      ME_CONFIG_MONGODB_ADMINUSERNAME: "off"
      ME_CONFIG_MONGODB_ADMINPASSWORD: "off-wger"
      ME_CONFIG_MONGODB_URL: "mongodb://off:off-wger@mongodb:27017/"

View File

@@ -1,234 +0,0 @@
# This file is part of wger Workout Manager.
#
# wger Workout Manager is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# wger Workout Manager is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
from collections import Counter
import enum
from pymongo import MongoClient
import os
import django
import sys
sys.path.insert(0, os.path.join('..', '..'))
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings")
django.setup()
from django.conf import settings # noqa: E402
from wger.nutrition.models import Ingredient # noqa: E402
from wger.nutrition.off import extract_info_from_off
from wger.core.models import Language # noqa: E402
"""
Simple script that imports and loads the Open Food Facts database into the
ingredients database.
NOTE: The file is VERY large (40 GB), so it takes a long time (> 3 hours) to
import the data and create all the ingredients.
* Requirements:
(note that the local mongo version needs to be compatible with the one used to
create the dump, otherwise the indices won't be compatible, it is best to use
a newer version than the one found in the ubuntu/debian repos)
- MongoDB
https://www.mongodb.com/docs/manual/tutorial/install-mongodb-on-ubuntu/
- Docker
snap install docker
pip3 install pymongo
apt-get install mongo-tools zip
* Steps:
wget https://static.openfoodfacts.org/data/openfoodfacts-mongodbdump.tar.gz
tar xzvf openfoodfacts-mongodbdump.tar.gz
# Import
docker pull mongo
docker run -it --name wger_mongo -p 27017:27017 -d \
-e MONGO_INITDB_ROOT_USERNAME=off \
-e MONGO_INITDB_ROOT_PASSWORD=off-wger \
mongo:latest
mongorestore --username off --password off-wger -d admin -c products dump/off/products.bson
# Process
python extras/scripts/create_ingredients_from_foodfacts.py
# Cleanup
docker stop wger_mongo
docker rm wger_mongo
rm openfoodfacts-mongodbdump.tar.gz
rm -r dump
# Update ingredient fixture
python3 manage.py dumpdata nutrition.ingredient > extras/scripts/data.json
cd extras/scripts/
python3 filter-fixtures.py
zip ingredients.json.zip ingredients.json
"""
# Connect to the MongoDB instance on localhost:27017 with the off / off-wger
# credentials. Note that this runs at import time of the script.
client = MongoClient('mongodb://off:off-wger@127.0.0.1', port=27017)
# The `products` collection is read from the `admin` database (see db.products
# in main)
db = client.admin
class Mode(enum.Enum):
    """
    Operating mode of this script.

    INSERT: new ingredients are bulk-inserted, which is very efficient.
            Importing the whole database will require barely a minute.
    UPDATE: existing ingredients are updated, which requires two queries
            per product.
    """
    INSERT = 1
    UPDATE = 2


# Mode the script runs in
MODE = Mode.UPDATE

# Number of ingredients collected before they are bulk-inserted
BULK_SIZE = 500

# Minimum completeness a product needs in order to be imported. The completeness
# is a value between 0 and 1.1 and shows how much product information is in the
# open food facts DB
COMPLETENESS = 0.7
# Get some completeness statistics
# for lang in ['de', 'es', 'en']:
# count = db.products.count_documents({'lang': lang, 'completeness': {"$gt": COMPLETENESS}})
# # count = db.products.count_documents({'lang': lang, 'complete': 1})
# total = db.products.count_documents({'lang': lang})
# print(f'Lang {lang} has {count} completed products out of {total}')
# import sys
# sys.exit()
# Completed ingredients as of 2023-04-08
#
# Lang az has 0 completed products out of 40
# Lang id has 6 completed products out of 981
# Lang cs has 80 completed products out of 4654
# Lang de has 1025 completed products out of 165771
# Lang en has 1028 completed products out of 929003
# Lang es has 618 completed products out of 303937
# Lang eo has 0 completed products out of 9
# Lang fr has 7125 completed products out of 1160796
# Lang hr has 57 completed products out of 1699
# Lang it has 870 completed products out of 216046
# Lang nl has 71 completed products out of 11744
# Lang no has 4 completed products out of 261
# Lang pl has 48 completed products out of 7145
# Lang pt has 127 completed products out of 9802
# Lang sv has 308 completed products out of 4622
# Lang tr has 3 completed products out of 1296
# Lang el has 6 completed products out of 927
# Lang bg has 31 completed products out of 4524
# Lang ru has 23 completed products out of 11683
# Lang uk has 2 completed products out of 604
# Lang he has 0 completed products out of 365
# Lang ar has 1 completed products out of 3552
# Lang fa has 0 completed products out of 575
# Lang zh has 2 completed products out of 935
# 2023-11-22
# completeness > 0.7
# Lang de has 47605 completed products out of 195056
# Lang es has 23187 completed products out of 311365
# Lang en has 39555 completed products out of 979534
def _save_bucket(bucket, counter):
    """
    Persist a batch of unsaved Ingredient objects.

    Tries a single bulk insert first. If that fails, every ingredient is saved
    individually, as most of them will be correct. The counter is updated with
    the number of entries that were actually saved ('new') or failed ('error').
    """
    if not bucket:
        return

    try:
        Ingredient.objects.bulk_create(bucket)
        print('***** Bulk adding products *****')
        counter['new'] += len(bucket)
    except Exception as e:
        print('--> Error while saving the product bucket. Saving individually')
        print(e)

        # Try saving the ingredients individually as most will be correct
        for ingredient in bucket:
            try:
                ingredient.save()
                counter['new'] += 1
            except Exception as save_error:
                print('--> Error while saving the product individually')
                print(save_error)
                counter['error'] += 1


def main():
    """
    Import the products from the mongo database into the ingredients table.

    Only products in one of the languages known to wger and with a completeness
    above COMPLETENESS are read. Depending on MODE they are either collected
    and bulk-inserted or individually updated / created. A summary with the
    number of new, edited, skipped and failed products is printed at the end.
    """
    languages = {l.short_name: l for l in Language.objects.all()}

    bulk_update_bucket = []
    counter = Counter()

    print('***********************************')
    print(languages.keys())
    print('***********************************')

    for product in db.products.find({
        'lang': {"$in": list(languages.keys())},
        'completeness': {"$gt": COMPLETENESS}
    }):
        try:
            ingredient_data = extract_info_from_off(product, languages[product['lang']])
        except KeyError as e:
            print('--> KeyError while extracting info from OFF', e)
            counter['skipped'] += 1
            continue

        # Some products have no name or name is too long, skipping
        if not ingredient_data['name'] or not ingredient_data['common_name']:
            counter['skipped'] += 1
            continue

        #
        # Add entries as new products
        if MODE == Mode.INSERT:
            bulk_update_bucket.append(Ingredient(**ingredient_data))

            # Flush as soon as the bucket holds exactly BULK_SIZE entries
            if len(bulk_update_bucket) >= BULK_SIZE:
                _save_bucket(bulk_update_bucket, counter)
                bulk_update_bucket = []

        # Update existing entries
        else:
            try:
                # Update an existing product (look-up key is the code) or create a new
                # one. While this might not be the most efficient query (there will always
                # be a SELECT first), it's ok because this script is run very rarely.
                obj, created = Ingredient.objects.update_or_create(
                    code=ingredient_data['code'],
                    defaults=ingredient_data
                )

                if created:
                    counter['new'] += 1
                else:
                    counter['edited'] += 1

            except Exception as e:
                print('--> Error while performing update_or_create')
                print(f'    ingredient: {ingredient_data["name"]}')
                print(e)
                print(ingredient_data)
                counter['error'] += 1

    # Save whatever is left in the bucket, otherwise the last (partial) batch
    # would silently be lost
    _save_bucket(bulk_update_bucket, counter)

    print('***********************************')
    print(counter)
    print('***********************************')


if __name__ == "__main__":
    main()

View File

@@ -523,8 +523,8 @@ by the US Department of Agriculture. It is extremely complete, with around
"""
if (
not WeightEntry.objects.filter(user=self.user).exists() or (
datetime.date.today() - WeightEntry.objects.filter(user=self.user).latest().date
> datetime.timedelta(days=3)
datetime.date.today() - WeightEntry.objects.filter(user=self.user).latest().date >
datetime.timedelta(days=3)
)
):
entry = WeightEntry()

View File

@@ -0,0 +1,183 @@
# This file is part of wger Workout Manager.
#
# wger Workout Manager is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# wger Workout Manager is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# Standard Library
import enum
import logging
from collections import Counter
# Django
from django.core.management.base import BaseCommand
# wger
from wger.core.models import Language
from wger.nutrition.models import Ingredient
from wger.nutrition.off import extract_info_from_off
logger = logging.getLogger(__name__)
class Mode(enum.Enum):
    """
    Operating mode of the import.

    INSERT: new ingredients are bulk-inserted, which is very efficient.
            Importing the whole database will require barely a minute.
    UPDATE: existing ingredients are updated, which requires two queries
            per product.
    """
    INSERT = 1
    UPDATE = 2
class Command(BaseCommand):
    """
    Import an Open Food Facts dump

    The products are read from the `products` collection of the `admin`
    database of a MongoDB instance running on localhost:27017 and are either
    bulk-inserted as new ingredients or used to update / create ingredients
    one by one, depending on the selected mode.
    """

    # Defaults, overwritten in handle() with the values passed on the command line
    mode = Mode.UPDATE
    bulk_size = 500
    completeness = 0.7

    help = 'Import an Open Food Facts dump. Please consult extras/docker/open-food-facts'

    def add_arguments(self, parser):
        """Register the --set-mode and --completeness options."""
        parser.add_argument(
            '--set-mode',
            action='store',
            default='update',
            choices=('insert', 'update'),
            dest='mode',
            type=str,
            help='Script mode, "insert" or "update". Insert will insert the ingredients as new '
            'entries in the database, while update will try to update them if they are '
            'already present. Default: update'
        )
        parser.add_argument(
            '--completeness',
            action='store',
            default=0.7,
            dest='completeness',
            type=float,
            help='Completeness threshold for importing the products. Products in OFF have '
            'completeness score that ranges from 0 to 1.1'
        )

    def handle(self, **options):
        """
        Run the import.

        Returns early (after printing a message) if pymongo is not installed
        or if the completeness threshold is outside of the 0 - 1.1 range.
        """
        try:
            # Third Party
            from pymongo import MongoClient
        except ImportError:
            self.stdout.write('Please install pymongo, `pip install pymongo`')
            return

        self.mode = Mode.INSERT if options['mode'] == 'insert' else Mode.UPDATE

        if not 0 <= options['completeness'] <= 1.1:
            self.stdout.write('Completeness must be between 0 and 1.1')
            return
        self.completeness = options['completeness']

        self.stdout.write(self.style.SUCCESS('Importing entries from Open Food Facts'))
        self.stdout.write(self.style.SUCCESS(f' - Completeness threshold: {self.completeness}'))
        self.stdout.write(self.style.SUCCESS(f' - Mode: {self.mode}'))
        self.stdout.write(self.style.SUCCESS(''))

        languages = {l.short_name: l for l in Language.objects.all()}
        bulk_update_bucket = []
        counter = Counter()

        client = MongoClient('mongodb://off:off-wger@127.0.0.1', port=27017)
        try:
            db = client.admin
            query = {
                'lang': {
                    "$in": list(languages.keys())
                },
                'completeness': {
                    "$gt": self.completeness
                }
            }

            for product in db.products.find(query):
                try:
                    ingredient_data = extract_info_from_off(product, languages[product['lang']])
                except KeyError as e:
                    # Note: stdout.write() only accepts one message string, its second
                    # positional argument is a style function
                    self.stdout.write(f'--> KeyError while extracting info from OFF: {e}')
                    counter['skipped'] += 1
                    continue

                # Some products have no name or name is too long, skipping
                if not ingredient_data['name'] or not ingredient_data['common_name']:
                    counter['skipped'] += 1
                    continue

                #
                # Add entries as new products
                if self.mode == Mode.INSERT:
                    bulk_update_bucket.append(Ingredient(**ingredient_data))

                    # Flush as soon as the bucket holds exactly bulk_size entries
                    if len(bulk_update_bucket) >= self.bulk_size:
                        self._flush_bucket(bulk_update_bucket, counter)
                        bulk_update_bucket = []

                # Update existing entries
                else:
                    self._update_or_create(ingredient_data, counter)

            # Save whatever is left in the bucket, otherwise the last (partial)
            # batch would silently be lost
            self._flush_bucket(bulk_update_bucket, counter)
        finally:
            client.close()

        self.stdout.write(self.style.SUCCESS('**************************************'))
        self.stdout.write(self.style.SUCCESS('Finished!'))
        self.stdout.write(self.style.SUCCESS(str(counter)))
        self.stdout.write(self.style.SUCCESS('**************************************'))

    def _flush_bucket(self, bucket, counter):
        """
        Persist a batch of unsaved Ingredient objects.

        Tries a single bulk insert first. If that fails, every ingredient is
        saved individually, as most of them will be correct. The counter is
        updated with the number of entries that were actually saved ('new') or
        that failed ('error').
        """
        if not bucket:
            return

        try:
            Ingredient.objects.bulk_create(bucket)
            self.stdout.write('***** Bulk adding products *****')
            counter['new'] += len(bucket)
        except Exception as e:
            self.stdout.write('--> Error while saving the product bucket. Saving individually')
            self.stdout.write(str(e))

            # Try saving the ingredients individually as most will be correct
            for ingredient in bucket:
                try:
                    ingredient.save()
                    counter['new'] += 1
                except Exception as save_error:
                    self.stdout.write('--> Error while saving the product individually')
                    self.stdout.write(str(save_error))
                    counter['error'] += 1

    def _update_or_create(self, ingredient_data, counter):
        """
        Update the ingredient with the given code or create a new one.

        While this might not be the most efficient query (there will always
        be a SELECT first), it's ok because this script is run very rarely.
        """
        try:
            obj, created = Ingredient.objects.update_or_create(
                code=ingredient_data['code'], defaults=ingredient_data
            )
        except Exception as e:
            self.stdout.write('--> Error while performing update_or_create')
            self.stdout.write(str(e))
            counter['error'] += 1
            return

        counter['new' if created else 'edited'] += 1