data_sync.sh 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. #!/bin/bash
  2. # $Id$
  3. # $Author$
  4. # $log$
  5. #ident "@(#)LocalFoodAI:data_sync.sh:$Format:%D:%ci:%cN:%h$"
  6. # data_sync.sh - Automated Data Freshness Pipeline
  7. ONLINE_MODE=0
  8. DATA_DIR="./data"
  9. INGEST_FILE="en.openfoodfacts.org.products.csv"
  10. URL="https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv"
  11. # Parse arguments
  12. while [[ "$#" -gt 0 ]]; do
  13. case $1 in
  14. --online) ONLINE_MODE=1 ;;
  15. *) echo "Unknown parameter: $1"; exit 1 ;;
  16. esac
  17. shift
  18. done
  19. echo "Starting Data Freshness Sync..."
  20. mkdir -p "$DATA_DIR"
  21. if [ "$ONLINE_MODE" -eq 1 ]; then
  22. echo "Online mode enabled. Checking for latest OpenFoodFacts database..."
  23. if ping -c 1 google.com &> /dev/null; then
  24. echo "Internet connection verified. Downloading latest dataset..."
  25. # Use -N to only download if newer than local file
  26. wget -N -P "$DATA_DIR" "$URL"
  27. if [ $? -eq 0 ]; then
  28. echo "Download complete."
  29. else
  30. echo "Failed to download dataset."
  31. fi
  32. else
  33. echo "No internet access detected. Falling back to offline mode."
  34. fi
  35. else
  36. echo "Offline mode. Checking $DATA_DIR for manually dropped files..."
  37. fi
  38. # Check if file exists to trigger ingestion
  39. if [ -f "$DATA_DIR/$INGEST_FILE" ]; then
  40. # We should only ingest if the file is new or modified.
  41. # For simplicity, we just trigger ingestion if the file exists.
  42. # Ingest script handles DROP TABLE if needed, but wait: ingest_csv appends by default or we can modify it.
  43. echo "Found dataset: $DATA_DIR/$INGEST_FILE"
  44. echo "Triggering ingestion pipeline via Docker Compose..."
  45. sudo docker-compose run --rm ingest ./ingest_csv.py "data/$INGEST_FILE"
  46. # After successful ingestion, move or rename to prevent infinite loops on offline cron
  47. mv "$DATA_DIR/$INGEST_FILE" "$DATA_DIR/$INGEST_FILE.processed"
  48. echo "Ingestion complete and file archived."
  49. else
  50. echo "No dataset found in $DATA_DIR. Nothing to ingest."
  51. fi