#!/bin/bash
# $Id$
# $Author$
# $log$
#ident "@(#)LocalFoodAI:data_sync.sh:$Format:%D:%ci:%cN:%h$"
# data_sync.sh - Automated Data Freshness Pipeline
#
# Usage: ./data_sync.sh [--online]
#
# On first invocation the script authenticates sudo interactively, then
# re-executes itself under nohup in the background so the ingestion run
# survives an SSH disconnect. The background copy is started with the
# --detached flag, which is stripped before normal argument parsing.

# --- Auto-Detach & Sudo Auth Block ---
if [ "$1" = "--detached" ]; then
  shift # Remove --detached from arguments for normal parsing
else
  echo "Preparing to run ingestion in the background to survive SSH disconnections."
  echo "Please provide your sudo password to authorize the background task:"
  # Authenticate interactively upfront; test the command directly rather
  # than inspecting $? afterwards.
  if ! sudo -v; then
    echo "Authentication failed. Exiting."
    exit 1
  fi
  echo "Authentication successful! Detaching process..."
  # Re-exec this script in the background, detached from the terminal;
  # all output goes to data_sync.log in the current directory.
  nohup sudo "$0" --detached "$@" > data_sync.log 2>&1 < /dev/null &
  echo "Process successfully detached! You can now safely close your SSH connection."
  echo "To monitor progress at any time, type: tail -f data_sync.log"
  exit 0
fi
# -------------------------------------
# Defaults: offline unless --online is requested on the command line.
ONLINE_MODE=0
DATA_DIR="./data"
INGEST_FILE="en.openfoodfacts.org.products.csv"
URL="https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv"

# Parse arguments
while (( $# > 0 )); do
  case "$1" in
    --online)
      ONLINE_MODE=1
      ;;
    *)
      echo "Unknown parameter: $1"
      exit 1
      ;;
  esac
  shift
done
echo "Starting Data Freshness Sync..."
mkdir -p "$DATA_DIR"

if [ "$ONLINE_MODE" -eq 1 ]; then
  echo "Online mode enabled. Checking for latest OpenFoodFacts database..."
  # Probe the actual download URL instead of pinging google.com: ICMP is
  # frequently blocked on servers, and google being reachable says nothing
  # about the OpenFoodFacts mirror. --spider issues a HEAD-style request
  # without downloading the (large) file.
  if wget -q --spider --timeout=15 "$URL"; then
    echo "Internet connection verified. Downloading latest dataset..."
    # -N: only download if the remote file is newer than the local copy.
    if wget -N -P "$DATA_DIR" "$URL"; then
      echo "Download complete."
    else
      echo "Failed to download dataset."
    fi
  else
    echo "No internet access detected. Falling back to offline mode."
  fi
else
  echo "Offline mode. Checking $DATA_DIR for manually dropped files..."
fi
# Check if file exists to trigger ingestion
if [ -f "$DATA_DIR/$INGEST_FILE" ]; then
  echo "Found dataset: $DATA_DIR/$INGEST_FILE"
  echo "Triggering ingestion pipeline via Docker Compose..."
  # NOTE(review): assumes the legacy docker-compose v1 binary; newer hosts
  # may only ship the 'docker compose' plugin — confirm on the target box.
  if sudo docker-compose run --rm ingest ./ingest_csv.py "data/$INGEST_FILE"; then
    # Archive ONLY after a successful ingest. The previous version renamed
    # the file unconditionally, so a failed ingestion silently discarded
    # the dataset. Renaming also prevents an offline cron from re-ingesting
    # the same file forever.
    mv -- "$DATA_DIR/$INGEST_FILE" "$DATA_DIR/$INGEST_FILE.processed"
    echo "Ingestion complete and file archived."
  else
    echo "Ingestion failed; leaving $DATA_DIR/$INGEST_FILE in place for retry." >&2
    exit 1
  fi
else
  echo "No dataset found in $DATA_DIR. Nothing to ingest."
fi