# data_sync.sh (2.7 KB)
  1. #!/bin/bash
  2. # $Id$
  3. # $Author$
  4. # $log$
  5. #ident "@(#)LocalFoodAI:data_sync.sh:$Format:%D:%ci:%cN:%h$"
  6. # data_sync.sh - Automated Data Freshness Pipeline
  7. # --- Auto-Detach & Sudo Auth Block ---
  8. if [ "$1" = "--detached" ]; then
  9. shift # Remove --detached from arguments for normal parsing
  10. else
  11. echo "Preparing to run ingestion in the background to survive SSH disconnections."
  12. echo "Please provide your sudo password to authorize the background task:"
  13. sudo -v # Authenticate interactively upfront
  14. if [ $? -ne 0 ]; then
  15. echo "Authentication failed. Exiting."
  16. exit 1
  17. fi
  18. echo "Authentication successful! Detaching process..."
  19. nohup sudo "$0" --detached "$@" > data_sync.log 2>&1 < /dev/null &
  20. echo "Process successfully detached! You can now safely close your SSH connection."
  21. echo "To monitor progress at any time, type: tail -f data_sync.log"
  22. exit 0
  23. fi
  24. # -------------------------------------
  25. ONLINE_MODE=0
  26. DATA_DIR="./data"
  27. INGEST_FILE="en.openfoodfacts.org.products.csv"
  28. URL="https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv"
  29. # Parse arguments
  30. while [[ "$#" -gt 0 ]]; do
  31. case $1 in
  32. --online) ONLINE_MODE=1 ;;
  33. *) echo "Unknown parameter: $1"; exit 1 ;;
  34. esac
  35. shift
  36. done
  37. echo "Starting Data Freshness Sync..."
  38. mkdir -p "$DATA_DIR"
  39. if [ "$ONLINE_MODE" -eq 1 ]; then
  40. echo "Online mode enabled. Checking for latest OpenFoodFacts database..."
  41. if ping -c 1 google.com &> /dev/null; then
  42. echo "Internet connection verified. Downloading latest dataset..."
  43. # Use -N to only download if newer than local file
  44. wget -N -P "$DATA_DIR" "$URL"
  45. if [ $? -eq 0 ]; then
  46. echo "Download complete."
  47. else
  48. echo "Failed to download dataset."
  49. fi
  50. else
  51. echo "No internet access detected. Falling back to offline mode."
  52. fi
  53. else
  54. echo "Offline mode. Checking $DATA_DIR for manually dropped files..."
  55. fi
  56. # Check if file exists to trigger ingestion
  57. if [ -f "$DATA_DIR/$INGEST_FILE" ]; then
  58. # We should only ingest if the file is new or modified.
  59. # For simplicity, we just trigger ingestion if the file exists.
  60. # Ingest script handles DROP TABLE if needed, but wait: ingest_csv appends by default or we can modify it.
  61. echo "Found dataset: $DATA_DIR/$INGEST_FILE"
  62. echo "Triggering ingestion pipeline via Docker Compose..."
  63. sudo docker-compose run --rm ingest ./ingest_csv.py "data/$INGEST_FILE"
  64. # After successful ingestion, move or rename to prevent infinite loops on offline cron
  65. mv "$DATA_DIR/$INGEST_FILE" "$DATA_DIR/$INGEST_FILE.processed"
  66. echo "Ingestion complete and file archived."
  67. else
  68. echo "No dataset found in $DATA_DIR. Nothing to ingest."
  69. fi