data_sync.sh 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. #!/bin/bash
  2. # $Id$
  3. # $Author$
  4. # $log$
  5. # data_sync.sh - Automated Data Freshness Pipeline
  6. ONLINE_MODE=0
  7. DATA_DIR="./data"
  8. INGEST_FILE="en.openfoodfacts.org.products.csv"
  9. URL="https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv"
  10. # Parse arguments
  11. while [[ "$#" -gt 0 ]]; do
  12. case $1 in
  13. --online) ONLINE_MODE=1 ;;
  14. *) echo "Unknown parameter: $1"; exit 1 ;;
  15. esac
  16. shift
  17. done
  18. echo "Starting Data Freshness Sync..."
  19. mkdir -p "$DATA_DIR"
  20. if [ "$ONLINE_MODE" -eq 1 ]; then
  21. echo "Online mode enabled. Checking for latest OpenFoodFacts database..."
  22. if ping -c 1 google.com &> /dev/null; then
  23. echo "Internet connection verified. Downloading latest dataset..."
  24. # Use -N to only download if newer than local file
  25. wget -N -P "$DATA_DIR" "$URL"
  26. if [ $? -eq 0 ]; then
  27. echo "Download complete."
  28. else
  29. echo "Failed to download dataset."
  30. fi
  31. else
  32. echo "No internet access detected. Falling back to offline mode."
  33. fi
  34. else
  35. echo "Offline mode. Checking $DATA_DIR for manually dropped files..."
  36. fi
  37. # Check if file exists to trigger ingestion
  38. if [ -f "$DATA_DIR/$INGEST_FILE" ]; then
  39. # We should only ingest if the file is new or modified.
  40. # For simplicity, we just trigger ingestion if the file exists.
  41. # Ingest script handles DROP TABLE if needed, but wait: ingest_csv appends by default or we can modify it.
  42. echo "Found dataset: $DATA_DIR/$INGEST_FILE"
  43. echo "Triggering ingestion pipeline via Docker Compose..."
  44. sudo docker-compose run --rm ingest ./ingest_csv.py "data/$INGEST_FILE"
  45. # After successful ingestion, move or rename to prevent infinite loops on offline cron
  46. mv "$DATA_DIR/$INGEST_FILE" "$DATA_DIR/$INGEST_FILE.processed"
  47. echo "Ingestion complete and file archived."
  48. else
  49. echo "No dataset found in $DATA_DIR. Nothing to ingest."
  50. fi