mirror of
https://github.com/wger-project/wger.git
synced 2026-02-18 00:17:51 +01:00
Simplify OFF import script
This is now set up automatically with docker compose. The script has also been moved to a management command, which is cleaner.
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -75,3 +75,6 @@ venv-wger
|
||||
/wger/app_en.arb
|
||||
/coverage.lcov
|
||||
/media/
|
||||
/static/
|
||||
/extras/docker/open-food-facts/dump/*.tar.gz
|
||||
/extras/docker/open-food-facts/dump/off/*
|
||||
|
||||
52
extras/docker/open-food-facts/README.md
Normal file
52
extras/docker/open-food-facts/README.md
Normal file
@@ -0,0 +1,52 @@
|
||||
# Import Open Food Facts products
|
||||
|
||||
This docker compose helps import or update products from the Open Food Facts
|
||||
database into wger.
|
||||
|
||||
Note that the OFF database dump is very large, and you will need several times
|
||||
this size available on your computer (tar.gz-file, extracted dump, mongo).
|
||||
|
||||
## 1
|
||||
|
||||
Download a current dump of their database
|
||||
|
||||
```shell
|
||||
cd dump
|
||||
wget https://static.openfoodfacts.org/data/openfoodfacts-mongodbdump.tar.gz
|
||||
tar xzvf openfoodfacts-mongodbdump.tar.gz
|
||||
```
|
||||
|
||||
## 2
|
||||
|
||||
Import the data into mongo.
|
||||
|
||||
Note that we are running this as a manual step since the import takes a while
|
||||
|
||||
```shell
|
||||
docker compose up
|
||||
docker compose exec mongodb mongorestore --username off --password off-wger -d admin -c products /dump/off/products.bson
|
||||
```
|
||||
|
||||
There is an admin interface available at <http://localhost:8081>, log in with
|
||||
these credentials:
|
||||
|
||||
* admin
|
||||
* pass
|
||||
|
||||
## 3
|
||||
|
||||
Run the import script
|
||||
|
||||
```shell
|
||||
python manage.py import-off-products
|
||||
```
|
||||
|
||||
## 4
|
||||
|
||||
Don't forget to delete the dump and remove the containers if you love your
|
||||
hard disk
|
||||
|
||||
```shell
|
||||
docker compose down
|
||||
rm -r dump/openfoodfacts-mongodbdump.tar.gz dump/off
|
||||
```
|
||||
27
extras/docker/open-food-facts/docker-compose.yml
Normal file
27
extras/docker/open-food-facts/docker-compose.yml
Normal file
@@ -0,0 +1,27 @@
|
||||
#
|
||||
# Helper services for importing an Open Food Facts dump into wger: a MongoDB
|
||||
# instance that the dump is restored into and a mongo-express admin interface
|
||||
# to inspect it. This file is only meant for the local import workflow that is
|
||||
# described in the README.md in this folder, it is not part of a regular wger
|
||||
# deployment. The credentials below are hard-coded, don't expose these ports.
|
||||
|
||||
services:
|
||||
mongodb:
|
||||
image: mongo
|
||||
ports:
|
||||
- "27017:27017"
|
||||
volumes:
|
||||
- $PWD/dump:/dump
|
||||
environment:
|
||||
MONGO_INITDB_ROOT_USERNAME: off
|
||||
MONGO_INITDB_ROOT_PASSWORD: off-wger
|
||||
|
||||
mongo-express:
|
||||
image: mongo-express
|
||||
restart: always
|
||||
ports:
|
||||
- 8081:8081
|
||||
environment:
|
||||
ME_CONFIG_MONGODB_ADMINUSERNAME: off
|
||||
ME_CONFIG_MONGODB_ADMINPASSWORD: off-wger
|
||||
ME_CONFIG_MONGODB_URL: mongodb://off:off-wger@mongodb:27017/
|
||||
0
extras/docker/open-food-facts/dump/.empty
Normal file
0
extras/docker/open-food-facts/dump/.empty
Normal file
@@ -1,234 +0,0 @@
|
||||
# This file is part of wger Workout Manager.
|
||||
#
|
||||
# wger Workout Manager is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# wger Workout Manager is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
|
||||
from collections import Counter
|
||||
import enum
|
||||
|
||||
from pymongo import MongoClient
|
||||
import os
|
||||
import django
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.join('..', '..'))
|
||||
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings")
|
||||
django.setup()
|
||||
from django.conf import settings # noqa: E402
|
||||
|
||||
from wger.nutrition.models import Ingredient # noqa: E402
|
||||
from wger.nutrition.off import extract_info_from_off
|
||||
from wger.core.models import Language # noqa: E402
|
||||
|
||||
"""
|
||||
Simple script that imports and loads the Open Food Facts database into the
|
||||
ingredients database.
|
||||
|
||||
NOTE: The file is VERY large (40 GB), so it takes a long time (> 3 hours) to
|
||||
import the data and create all the ingredients.
|
||||
|
||||
|
||||
* Requirements:
|
||||
(note that the local mongo version needs to be compatible with the one used to
|
||||
create the dump, otherwise the indices won't be compatible, it is best to use
|
||||
a newer version than the one found in the ubuntu/debian repos)
|
||||
|
||||
- MongoDB
|
||||
https://www.mongodb.com/docs/manual/tutorial/install-mongodb-on-ubuntu/
|
||||
|
||||
- Docker
|
||||
snap install docker
|
||||
|
||||
pip3 install pymongo
|
||||
apt-get install mongo-tools zip
|
||||
|
||||
* Steps:
|
||||
wget https://static.openfoodfacts.org/data/openfoodfacts-mongodbdump.tar.gz
|
||||
tar xzvf openfoodfacts-mongodbdump.tar.gz
|
||||
|
||||
# Import
|
||||
docker pull mongo
|
||||
docker run -it --name wger_mongo -p 27017:27017 -d \
|
||||
-e MONGO_INITDB_ROOT_USERNAME=off \
|
||||
-e MONGO_INITDB_ROOT_PASSWORD=off-wger \
|
||||
mongo:latest
|
||||
mongorestore --username off --password off-wger -d admin -c products dump/off/products.bson
|
||||
|
||||
# Process
|
||||
python extras/scripts/create_ingredients_from_foodfacts.py
|
||||
|
||||
# Cleanup
|
||||
docker stop wger_mongo
|
||||
docker rm wger_mongo
|
||||
rm openfoodfacts-mongodbdump.tar.gz
|
||||
rm -r dump
|
||||
|
||||
# Update ingredient fixture
|
||||
python3 manage.py dumpdata nutrition.ingredient > extras/scripts/data.json
|
||||
cd extras/scripts/
|
||||
python3 filter-fixtures.py
|
||||
zip ingredients.json.zip ingredients.json
|
||||
"""
|
||||
|
||||
client = MongoClient('mongodb://off:off-wger@127.0.0.1', port=27017)
|
||||
db = client.admin
|
||||
|
||||
|
||||
# Mode for this script. When using 'insert', the script will bulk-insert the new
|
||||
# ingredients, which is very efficient. Importing the whole database will require
|
||||
# barely a minute. When using 'update', existing ingredients will be updated, which
|
||||
# requires two queries per product.
|
||||
class Mode(enum.Enum):
    """
    Operating mode of the import script.

    INSERT: products are collected and added as new ingredients in bulk.
    UPDATE: every product is looked up by its code and the existing
            ingredient is updated, or a new one is created.
    """

    INSERT = 1
    UPDATE = 2
|
||||
|
||||
|
||||
MODE = Mode.UPDATE
|
||||
BULK_SIZE = 500
|
||||
|
||||
# The completeness is a value between 0 and 1.1 and shows how much product information
|
||||
# is in the open food facts DB
|
||||
COMPLETENESS = 0.7
|
||||
|
||||
|
||||
# Get some completeness statistics
|
||||
# for lang in ['de', 'es', 'en']:
|
||||
# count = db.products.count_documents({'lang': lang, 'completeness': {"$gt": COMPLETENESS}})
|
||||
# # count = db.products.count_documents({'lang': lang, 'complete': 1})
|
||||
# total = db.products.count_documents({'lang': lang})
|
||||
# print(f'Lang {lang} has {count} completed products out of {total}')
|
||||
# import sys
|
||||
# sys.exit()
|
||||
|
||||
# Completed ingredients as of 2023-04-08
|
||||
#
|
||||
# Lang az has 0 completed products out of 40
|
||||
# Lang id has 6 completed products out of 981
|
||||
# Lang cs has 80 completed products out of 4654
|
||||
# Lang de has 1025 completed products out of 165771
|
||||
# Lang en has 1028 completed products out of 929003
|
||||
# Lang es has 618 completed products out of 303937
|
||||
# Lang eo has 0 completed products out of 9
|
||||
# Lang fr has 7125 completed products out of 1160796
|
||||
# Lang hr has 57 completed products out of 1699
|
||||
# Lang it has 870 completed products out of 216046
|
||||
# Lang nl has 71 completed products out of 11744
|
||||
# Lang no has 4 completed products out of 261
|
||||
# Lang pl has 48 completed products out of 7145
|
||||
# Lang pt has 127 completed products out of 9802
|
||||
# Lang sv has 308 completed products out of 4622
|
||||
# Lang tr has 3 completed products out of 1296
|
||||
# Lang el has 6 completed products out of 927
|
||||
# Lang bg has 31 completed products out of 4524
|
||||
# Lang ru has 23 completed products out of 11683
|
||||
# Lang uk has 2 completed products out of 604
|
||||
# Lang he has 0 completed products out of 365
|
||||
# Lang ar has 1 completed products out of 3552
|
||||
# Lang fa has 0 completed products out of 575
|
||||
# Lang zh has 2 completed products out of 935
|
||||
|
||||
# 2023-11-22
|
||||
# completeness > 0.7
|
||||
# Lang de has 47605 completed products out of 195056
|
||||
# Lang es has 23187 completed products out of 311365
|
||||
# Lang en has 39555 completed products out of 979534
|
||||
|
||||
def _save_bucket(bucket, counter):
    """
    Save a list of unsaved ``Ingredient`` objects and update ``counter``.

    The bucket is first written with a single ``bulk_create``. If that fails,
    every ingredient is saved on its own so that one bad entry does not
    discard the whole batch. Only ingredients that were actually written are
    counted as 'new', individual failures are counted as 'error'.
    """
    try:
        Ingredient.objects.bulk_create(bucket)
    except Exception as e:
        print('--> Error while saving the product bucket. Saving individually')
        print(e)
    else:
        print('***** Bulk adding products *****')
        counter['new'] += len(bucket)
        return

    # Try saving the ingredients individually as most will be correct
    for ingredient in bucket:
        try:
            ingredient.save()
        except Exception as e:
            print('--> Error while saving the product individually')
            print(e)
            counter['error'] += 1
        else:
            counter['new'] += 1


def main():
    """
    Import the products found in the local mongo database as ingredients.

    Only products in one of the languages present in the ``Language`` table
    and with a completeness above ``COMPLETENESS`` are read. Depending on
    ``MODE`` they are either bulk-inserted as new ingredients or
    updated/created one by one, using the product code as look-up key.
    A summary with the number of new, edited, skipped and failed products is
    printed at the end.
    """
    languages = {lang.short_name: lang for lang in Language.objects.all()}

    bucket = []
    counter = Counter()

    print('***********************************')
    print(languages.keys())
    print('***********************************')

    query = {
        'lang': {"$in": list(languages)},
        'completeness': {"$gt": COMPLETENESS},
    }
    for product in db.products.find(query):

        try:
            ingredient_data = extract_info_from_off(product, languages[product['lang']])
        except KeyError as e:
            print('--> KeyError while extracting info from OFF', e)
            counter['skipped'] += 1
            continue

        # Products without a name or a common name are skipped
        if not ingredient_data['name'] or not ingredient_data['common_name']:
            counter['skipped'] += 1
            continue

        # Add entries as new products
        if MODE == Mode.INSERT:
            bucket.append(Ingredient(**ingredient_data))
            if len(bucket) >= BULK_SIZE:
                _save_bucket(bucket, counter)
                bucket = []

        # Update existing entries
        else:
            try:
                # Update an existing product (look-up key is the code) or create a new
                # one. While this might not be the most efficient query (there will always
                # be a SELECT first), it's ok because this script is run very rarely.
                obj, created = Ingredient.objects.update_or_create(
                    code=ingredient_data['code'],
                    defaults=ingredient_data,
                )
            except Exception as e:
                print('--> Error while performing update_or_create')
                print(f'    ingredient: {ingredient_data["name"]}')
                print(e)
                print(ingredient_data)
                counter['error'] += 1
                continue

            counter['new' if created else 'edited'] += 1

    # The last bucket is usually only partially filled and would otherwise
    # never be written to the database
    if bucket:
        _save_bucket(bucket, counter)

    print('***********************************')
    print(counter)
    print('***********************************')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -523,8 +523,8 @@ by the US Department of Agriculture. It is extremely complete, with around
|
||||
"""
|
||||
if (
|
||||
not WeightEntry.objects.filter(user=self.user).exists() or (
|
||||
datetime.date.today() - WeightEntry.objects.filter(user=self.user).latest().date
|
||||
> datetime.timedelta(days=3)
|
||||
datetime.date.today() - WeightEntry.objects.filter(user=self.user).latest().date >
|
||||
datetime.timedelta(days=3)
|
||||
)
|
||||
):
|
||||
entry = WeightEntry()
|
||||
|
||||
183
wger/nutrition/management/commands/import-off-products.py
Normal file
183
wger/nutrition/management/commands/import-off-products.py
Normal file
@@ -0,0 +1,183 @@
|
||||
# This file is part of wger Workout Manager.
|
||||
#
|
||||
# wger Workout Manager is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU Affero General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# wger Workout Manager is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Affero General Public License
|
||||
|
||||
# Standard Library
|
||||
import enum
|
||||
import logging
|
||||
from collections import Counter
|
||||
|
||||
# Django
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
# wger
|
||||
from wger.core.models import Language
|
||||
from wger.nutrition.models import Ingredient
|
||||
from wger.nutrition.off import extract_info_from_off
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Mode for this script. When using 'insert', the script will bulk-insert the new
|
||||
# ingredients, which is very efficient. Importing the whole database will require
|
||||
# barely a minute. When using 'update', existing ingredients will be updated, which
|
||||
# requires two queries per product.
|
||||
class Mode(enum.Enum):
    """
    Operating mode of the import command.

    INSERT: products are collected and added as new ingredients in bulk.
    UPDATE: every product is looked up by its code and the existing
            ingredient is updated, or a new one is created.
    """

    INSERT = 1
    UPDATE = 2
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """
    Import products from an Open Food Facts MongoDB dump as ingredients.

    The command reads the ``products`` collection from the ``admin`` database
    of a MongoDB instance on 127.0.0.1:27017 (see extras/docker/open-food-facts)
    and, for every product in one of the languages present in the ``Language``
    table whose completeness is above the configured threshold, creates or
    updates an ``Ingredient``.
    """

    # Defaults, overridden by --set-mode and --completeness
    mode = Mode.UPDATE
    bulk_size = 500
    completeness = 0.7

    help = 'Import an Open Food Facts dump. Please consult extras/docker/open-food-facts'

    def add_arguments(self, parser):
        """Register the ``--set-mode`` and ``--completeness`` options."""
        parser.add_argument(
            '--set-mode',
            action='store',
            default='update',
            dest='mode',
            type=str,
            help='Script mode, "insert" or "update". Insert will insert the ingredients as new '
            'entries in the database, while update will try to update them if they are '
            'already present. Default: update'
        )
        parser.add_argument(
            '--completeness',
            action='store',
            default=0.7,
            dest='completeness',
            type=float,
            help='Completeness threshold for importing the products. Products in OFF have '
            'completeness score that ranges from 0 to 1.1'
        )

    def handle(self, **options):
        """
        Run the import and print a summary of the processed products.

        Prints a message and returns early, without importing anything, when
        pymongo is not installed or when the completeness threshold is not
        between 0 and 1.1.
        """
        try:
            # Third Party
            from pymongo import MongoClient
        except ImportError:
            self.stdout.write('Please install pymongo, `pip install pymongo`')
            return

        # Any value other than "insert" keeps the default mode
        if options['mode'] == 'insert':
            self.mode = Mode.INSERT

        if not 0 <= options['completeness'] <= 1.1:
            self.stdout.write('Completeness must be between 0 and 1.1')
            return
        self.completeness = options['completeness']

        self.stdout.write(self.style.SUCCESS('Importing entries from Open Food Facts'))
        self.stdout.write(self.style.SUCCESS(f' - Completeness threshold: {self.completeness}'))
        self.stdout.write(self.style.SUCCESS(f' - Mode: {self.mode}'))
        self.stdout.write(self.style.SUCCESS(''))

        client = MongoClient('mongodb://off:off-wger@127.0.0.1', port=27017)
        try:
            counter = self._import_products(client.admin.products)
        finally:
            # Release the connection even if the import is interrupted
            client.close()

        self.stdout.write(self.style.SUCCESS('**************************************'))
        self.stdout.write(self.style.SUCCESS('Finished!'))
        self.stdout.write(self.style.SUCCESS(str(counter)))
        self.stdout.write(self.style.SUCCESS('**************************************'))

    def _import_products(self, products):
        """
        Import all matching documents of the ``products`` mongo collection.

        Returns a ``Counter`` with the number of 'new', 'edited', 'skipped'
        and 'error' products.
        """
        languages = {lang.short_name: lang for lang in Language.objects.all()}

        bucket = []
        counter = Counter()

        query = {
            'lang': {"$in": list(languages)},
            'completeness': {"$gt": self.completeness},
        }
        for product in products.find(query):

            try:
                ingredient_data = extract_info_from_off(product, languages[product['lang']])
            except KeyError as e:
                # Note: stdout.write() only accepts a string as message, the
                # second positional argument is a style function
                self.stdout.write(f'--> KeyError while extracting info from OFF: {e}')
                counter['skipped'] += 1
                continue

            # Products without a name or a common name are skipped
            if not ingredient_data['name'] or not ingredient_data['common_name']:
                counter['skipped'] += 1
                continue

            # Add entries as new products
            if self.mode == Mode.INSERT:
                bucket.append(Ingredient(**ingredient_data))
                if len(bucket) >= self.bulk_size:
                    self._save_bucket(bucket, counter)
                    bucket = []

            # Update existing entries
            else:
                self._update_or_create(ingredient_data, counter)

        # The last bucket is usually only partially filled and would otherwise
        # never be written to the database
        if bucket:
            self._save_bucket(bucket, counter)

        return counter

    def _save_bucket(self, bucket, counter):
        """
        Save a list of unsaved ``Ingredient`` objects and update ``counter``.

        The bucket is first written with a single ``bulk_create``. If that
        fails, every ingredient is saved on its own so that one bad entry does
        not discard the whole batch. Only ingredients that were actually
        written are counted as 'new', individual failures as 'error'.
        """
        try:
            Ingredient.objects.bulk_create(bucket)
        except Exception as e:
            self.stdout.write('--> Error while saving the product bucket. Saving individually')
            self.stdout.write(str(e))
        else:
            self.stdout.write('***** Bulk adding products *****')
            counter['new'] += len(bucket)
            return

        # Try saving the ingredients individually as most will be correct
        for ingredient in bucket:
            try:
                ingredient.save()
            except Exception as e:
                self.stdout.write('--> Error while saving the product individually')
                self.stdout.write(str(e))
                counter['error'] += 1
            else:
                counter['new'] += 1

    def _update_or_create(self, ingredient_data, counter):
        """
        Update the ingredient with the product's code or create a new one.

        While this might not be the most efficient query (there will always be
        a SELECT first), it's ok because this command is run very rarely.
        """
        try:
            obj, created = Ingredient.objects.update_or_create(
                code=ingredient_data['code'], defaults=ingredient_data
            )
        except Exception as e:
            self.stdout.write('--> Error while performing update_or_create')
            self.stdout.write(str(e))
            counter['error'] += 1
            return

        counter['new' if created else 'edited'] += 1
|
||||
Reference in New Issue
Block a user