#!/bin/bash
# data_sync.sh - Automated Data Freshness Pipeline

# --- Auto-Detach & Sudo Auth Block ---
if [ "${1:-}" = "--detached" ]; then
    shift # Remove --detached from the arguments before normal parsing
else
    echo "Preparing to run ingestion in the background so it survives SSH disconnections."
    echo "Please provide your sudo password to authorize the background task:"
    if ! sudo -v; then # Authenticate interactively up front
        echo "Authentication failed. Exiting."
        exit 1
    fi
    echo "Authentication successful. Detaching process..."
    nohup sudo "$0" --detached "$@" > data_sync.log 2>&1 < /dev/null &
    echo "Process detached. You can now safely close your SSH connection."
    echo "To monitor progress at any time, run: tail -f data_sync.log"
    exit 0
fi
# -------------------------------------

ONLINE_MODE=0
DATA_DIR="./data"
INGEST_FILE="en.openfoodfacts.org.products.csv"
URL="https://static.openfoodfacts.org/data/en.openfoodfacts.org.products.csv"

# Parse arguments
while [[ "$#" -gt 0 ]]; do
    case $1 in
        --online) ONLINE_MODE=1 ;;
        *) echo "Unknown parameter: $1"; exit 1 ;;
    esac
    shift
done

echo "Starting Data Freshness Sync..."
mkdir -p "$DATA_DIR"

if [ "$ONLINE_MODE" -eq 1 ]; then
    echo "Online mode enabled. Checking for the latest OpenFoodFacts database..."
    # Note: this only verifies general connectivity, not that the download host itself is reachable.
    if ping -c 1 google.com &> /dev/null; then
        echo "Internet connection verified. Downloading latest dataset..."
        # -N (timestamping) downloads only if the remote file is newer than the local copy
        if wget -N -P "$DATA_DIR" "$URL"; then
            echo "Download complete."
        else
            echo "Failed to download dataset."
        fi
    else
        echo "No internet access detected. Falling back to offline mode."
    fi
else
    echo "Offline mode. Checking $DATA_DIR for manually dropped files..."
fi

# Trigger ingestion if the dataset file is present.
if [ -f "$DATA_DIR/$INGEST_FILE" ]; then
    # Ideally we would ingest only when the file is new or modified; for
    # simplicity, the file's presence alone triggers ingestion (see the
    # change-detection sketch at the end of this file). The ingest script
    # handles DROP TABLE if needed; ingest_csv appends by default.
    echo "Found dataset: $DATA_DIR/$INGEST_FILE"
    echo "Triggering ingestion pipeline via Docker Compose..."
    if sudo docker-compose run --rm ingest ./ingest_csv.py "data/$INGEST_FILE"; then
        # Archive the file after successful ingestion to prevent infinite
        # re-ingestion loops on offline cron runs (see the cron example below).
        mv "$DATA_DIR/$INGEST_FILE" "$DATA_DIR/$INGEST_FILE.processed"
        echo "Ingestion complete and file archived."
    else
        echo "Ingestion failed. Leaving $DATA_DIR/$INGEST_FILE in place for retry."
        exit 1
    fi
else
    echo "No dataset found in $DATA_DIR. Nothing to ingest."
fi
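
# --- Sketch: content-based change detection (not wired in) ---
# A minimal sketch of the "only ingest if the file is new or modified" check
# mentioned above, assuming coreutils sha256sum is available. The state file
# name .last_ingest.sha256 is an illustration, not part of the pipeline:
#
#   NEW_SUM=$(sha256sum "$DATA_DIR/$INGEST_FILE" | awk '{print $1}')
#   OLD_SUM=$(cat "$DATA_DIR/.last_ingest.sha256" 2>/dev/null)
#   if [ "$NEW_SUM" != "$OLD_SUM" ]; then
#       sudo docker-compose run --rm ingest ./ingest_csv.py "data/$INGEST_FILE"
#       echo "$NEW_SUM" > "$DATA_DIR/.last_ingest.sha256"
#   else
#       echo "Dataset unchanged since last ingest. Skipping."
#   fi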
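
# --- Example: scheduling via cron (paths and privileges are assumptions) ---
# The archiving step above exists to keep an offline cron run from
# re-ingesting the same file forever. A minimal crontab sketch, assuming the
# repo lives at /opt/LocalFoodAI and the cron user has passwordless sudo for
# docker-compose; passing --detached skips the interactive auth block:
#
#   0 2 * * *  cd /opt/LocalFoodAI && ./data_sync.sh --detached --online >> data_sync.log 2>&1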