data_sync.sh 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
  1. #!/bin/bash
  2. # data_sync.sh - Automated Data Freshness Pipeline
  3. ONLINE_MODE=0
  4. DATA_DIR="./data"
  5. INGEST_FILE="en.openfoodfacts.org.products.csv"
  6. URL="https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv"
  7. # Parse arguments
  8. while [[ "$#" -gt 0 ]]; do
  9. case $1 in
  10. --online) ONLINE_MODE=1 ;;
  11. *) echo "Unknown parameter: $1"; exit 1 ;;
  12. esac
  13. shift
  14. done
  15. echo "Starting Data Freshness Sync..."
  16. mkdir -p "$DATA_DIR"
  17. if [ "$ONLINE_MODE" -eq 1 ]; then
  18. echo "Online mode enabled. Checking for latest OpenFoodFacts database..."
  19. if ping -c 1 google.com &> /dev/null; then
  20. echo "Internet connection verified. Downloading latest dataset..."
  21. # Use -N to only download if newer than local file
  22. wget -N -P "$DATA_DIR" "$URL"
  23. if [ $? -eq 0 ]; then
  24. echo "Download complete."
  25. else
  26. echo "Failed to download dataset."
  27. fi
  28. else
  29. echo "No internet access detected. Falling back to offline mode."
  30. fi
  31. else
  32. echo "Offline mode. Checking $DATA_DIR for manually dropped files..."
  33. fi
  34. # Check if file exists to trigger ingestion
  35. if [ -f "$DATA_DIR/$INGEST_FILE" ]; then
  36. # We should only ingest if the file is new or modified.
  37. # For simplicity, we just trigger ingestion if the file exists.
  38. # Ingest script handles DROP TABLE if needed, but wait: ingest_csv appends by default or we can modify it.
  39. echo "Found dataset: $DATA_DIR/$INGEST_FILE"
  40. echo "Triggering ingestion pipeline via Docker Compose..."
  41. sudo docker-compose run --rm ingest ./ingest_csv.py "data/$INGEST_FILE"
  42. # After successful ingestion, move or rename to prevent infinite loops on offline cron
  43. mv "$DATA_DIR/$INGEST_FILE" "$DATA_DIR/$INGEST_FILE.processed"
  44. echo "Ingestion complete and file archived."
  45. else
  46. echo "No dataset found in $DATA_DIR. Nothing to ingest."
  47. fi