
TG-2: Restructure schema for all CSV columns, async ingestion, and mail forwarding

lanfr144 committed 3 weeks ago
parent
commit
ab7e3b1d3a
6 files changed with 126 additions and 36 deletions
  1. app.py (+1 -1)
  2. check_projects.py (+15 -0)
  3. ingest_csv.py (+46 -22)
  4. setup_db.py (+8 -13)
  5. setup_mail_forwarding.sh (+26 -0)
  6. start_batch_ingest.sh (+30 -0)

+ 1 - 1
app.py

@@ -128,7 +128,7 @@ with tab_chat:
         
         with st.spinner("Analyzing locally..."):
             try:
-                response = ollama.chat(model='llama3', messages=[
+                response = ollama.chat(model='mistral', messages=[
                     {'role': 'system', 'content': sys_prompt},
                     {'role': 'user', 'content': prompt}
                 ])
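The model swap above assumes the mistral weights are already present on the Ollama host; if not, the chat call fails at runtime. A minimal preflight sketch using the same ollama Python client (not part of this commit):

    import ollama

    # Check whether the model is available locally; pull it from the
    # registry on a miss. Both calls come from the ollama client library.
    try:
        ollama.show('mistral')
    except ollama.ResponseError:
        ollama.pull('mistral')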

+ 15 - 0
check_projects.py

@@ -0,0 +1,15 @@
+import requests
+import urllib3
+urllib3.disable_warnings()
+
+auth = requests.post(
+    'https://192.168.130.161/taiga/api/v1/auth', 
+    json={'type': 'normal', 'username': 'FrancoisLange', 'password': 'BTSai123'}, 
+    verify=False
+).json()
+
+headers = {'Authorization': f'Bearer {auth["auth_token"]}'}
+projs = requests.get('https://192.168.130.161/taiga/api/v1/projects', headers=headers, verify=False).json()
+print("Projects:")
+for p in projs:
+    print(f"ID: {p['id']}, Name: {p['name']}, Slug: {p['slug']}")

+ 46 - 22
ingest_csv.py

@@ -25,42 +25,62 @@ def ingest_file(filename, engine):
         print(f"File {filename} not found locally.")
         return False
         
-    print(f"\n🚀 Found {filename}! Starting ingestion via SQLAlchemy pipeline...")
-    
-    expected_columns = [
-        "code", "url", "creator", "created_t", "created_datetime", "last_modified_t", 
-        "last_modified_datetime", "product_name", "generic_name", "quantity", "packaging", 
-        "brands", "categories", "origins", "labels", "stores", "countries", "ingredients_text", 
-        "allergens", "traces"
-    ]
+    print(f"\n🚀 Found {filename}! Starting extreme batch ingestion...")
     
     chunk_size = 5000 
     total_processed = 0
 
-    for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip'):
-        # Filter explicitly to schema
-        available_cols = [col for col in expected_columns if col in chunk.columns]
-        df = chunk[available_cols]
-        
-        # Pandas to_sql safely transforms NaNs to SQL NULLs internally
+    # Read all columns dynamically, with no schema filter; dtype=str loads every value as text so pandas creates the table with the full CSV column set
+    for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip', low_memory=False):
         try:
-            # We use 'append' because the products table already exists with primary keys
-            # To handle duplicate 'code' primary keys effortlessly, we drop duplicates from the dataframe before insert
-            # Or depend on PyMySQL. But pandas natively crashes on dupes unless managed. 
-            df = df.drop_duplicates(subset=['code'])
+            # Drop duplicates by code natively
+            if 'code' in chunk.columns:
+                df = chunk.drop_duplicates(subset=['code'])
+            else:
+                df = chunk
+                
             df.to_sql('products', con=engine, if_exists='append', index=False)
             total_processed += len(df)
-            print(f"   Successfully appended {total_processed} rows...")
+            print(f"   Successfully appended {total_processed} rows (Dynamic schema)...", end="\r")
         except BaseException as e:
-            # If a strict primary key duplicate existed in DB already from a previous chunk, ignore row crashes
             if "Duplicate entry" in str(e):
                 pass
             else:
-                 print(f"   [Warning] Chunk skipped due to internal structural error: {e}")
+                 print(f"\n   [Warning] Chunk skipped due to internal structural error: {e}")
         
-    print(f"✅ Finished importing {filename}.")
+    print(f"\n✅ Finished importing {filename}.")
     return True
 
+def create_indexes(engine):
+    print("\n🛠️ Creating performance indexes on newly generated table...")
+    # B-TREE and FULLTEXT indexes are built post-ingestion so bulk inserts stay fast
+    from sqlalchemy import text  # wraps raw SQL strings for SQLAlchemy's execute()
+    try:
+        with engine.begin() as connection:
+            print("  Building Primary Key on `code`...")
+            # Pandas created `code` as TEXT, but MySQL cannot use TEXT as a PRIMARY KEY
+            # without a length prefix, so convert it to VARCHAR(50) first.
+            connection.execute(text("ALTER TABLE products MODIFY code VARCHAR(50);"))
+            connection.execute(text("ALTER TABLE products ADD PRIMARY KEY (code);"))
+
+            print("  Building Fulltext Indexes...")
+            connection.execute(text("CREATE FULLTEXT INDEX ft_idx_search ON products(product_name, ingredients_text, brands);"))
+
+            print("  Building B-TREE Indexes on core macros...")
+            # Index the key macro columns if they exist in this dump
+            macro_cols = ['energy-kcal_100g', 'fat_100g', 'carbohydrates_100g', 'proteins_100g']
+            for col in macro_cols:
+                # Convert TEXT to DOUBLE for numerical indexing and querying;
+                # skip silently if the column is missing from this CSV dump
+                try:
+                    connection.execute(text(f"ALTER TABLE products MODIFY `{col}` DOUBLE;"))
+                    connection.execute(text(f"CREATE INDEX idx_{col.replace('-', '_')} ON products(`{col}`);"))
+                except Exception:
+                    pass
+        print("✅ Indexing Complete!")
+    except Exception as e:
+        print(f"❌ Indexing encountered an issue: {e}")
+
 if __name__ == "__main__":
     print("Initiating OpenFoodFacts CSV Ingestion Process...")
     engine = get_loader_engine()
@@ -71,3 +91,7 @@ if __name__ == "__main__":
     if not processed_en and not processed_fr:
         print("\n❌ Could not find either 'en.openfoodfacts.org.products.csv' or 'fr.openfoodfacts.org.products.csv'.")
         print("Please download them directly into the root folder and run this script again.")
+    else:
+        # Build indexes now that all data is appended!
+        create_indexes(engine)
+        print("\n🎉 Full database reload and indexing complete! Ready for AI RAG.")

+ 8 - 13
setup_db.py

@@ -70,22 +70,17 @@ def run_db_setup():
         created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
     ) ENGINE=InnoDB;
     """)
-    # 2. Products Table
-    cursor.execute("""
-    CREATE TABLE IF NOT EXISTS food_db.products (
-        code VARCHAR(50) PRIMARY KEY, url TEXT, creator VARCHAR(255), created_t VARCHAR(50), 
-        created_datetime VARCHAR(50), last_modified_t VARCHAR(50), last_modified_datetime VARCHAR(50), 
-        product_name TEXT, generic_name TEXT, quantity VARCHAR(255), packaging TEXT, brands TEXT, 
-        categories TEXT, origins TEXT, labels TEXT, stores TEXT, countries TEXT, ingredients_text TEXT, 
-        allergens TEXT, traces TEXT, 
-        FULLTEXT INDEX ft_idx_search (product_name, ingredients_text)
-    ) ENGINE=InnoDB;
-    """)
+    # 2. Products Table (Dynamic Drop)
+    # We drop the strict schema completely. `ingest_csv.py` will use pandas to automatically 
+    # generate the table with 100% of the CSV columns dynamically defined as TEXT fields.
+    cursor.execute("DROP TABLE IF EXISTS food_db.products;")
     
     # Table Context Grants (SoD)
     cursor.execute("GRANT SELECT, INSERT, UPDATE ON food_db.users TO 'db_app_auth'@'%';")
-    cursor.execute("GRANT SELECT ON food_db.products TO 'db_reader'@'%';")
-    cursor.execute("GRANT SELECT, INSERT, UPDATE, DELETE, DROP, CREATE ON food_db.products TO 'db_loader'@'%';")
+    # Note: grants for the reader/loader users move to the database level, since the
+    # products table won't exist until pandas creates it during ingestion.
+    cursor.execute("GRANT SELECT ON food_db.* TO 'db_reader'@'%';")
+    cursor.execute("GRANT SELECT, INSERT, UPDATE, DELETE, DROP, CREATE, ALTER, INDEX ON food_db.* TO 'db_loader'@'%';")
     cursor.execute("FLUSH PRIVILEGES;")
 
     print("\n✅ Database, Users, and Tables created successfully!")

+ 26 - 0
setup_mail_forwarding.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+# run this as root/sudo on the Ubuntu VM
+
+echo "Setting up centralized mail forwarding to lanfr144@gmail.com..."
+
+# 1. Update the skeleton directory so all NEW users created automatically forward mail
+echo "lanfr144@gmail.com" | sudo tee /etc/skel/.forward
+sudo chmod 644 /etc/skel/.forward
+
+# 2. Add forwarding for every existing user home directory
+for user_dir in /home/*; do
+  if [ -d "$user_dir" ]; then
+    user_name=$(basename "$user_dir")
+    echo "lanfr144@gmail.com" | sudo tee "$user_dir/.forward"
+    sudo chown "$user_name":"$user_name" "$user_dir/.forward"
+    sudo chmod 644 "$user_dir/.forward"
+    echo "Configured for user: $user_name"
+  fi
+done
+
+# 3. Add forwarding for root manually
+echo "lanfr144@gmail.com" | sudo tee /root/.forward
+sudo chmod 644 /root/.forward
+echo "Configured for root."
+
+echo "✅ All system mail will now forward to lanfr144@gmail.com"

+ 30 - 0
start_batch_ingest.sh

@@ -0,0 +1,30 @@
+#!/bin/bash
+# Local Food AI - Disconnected Ingestion Wrapper
+# This script uses nohup to run the python ingestion script in the background.
+# You can exit your SSH session safely after starting this script.
+
+echo "========================================================="
+echo "🍔 Local Food AI: Extreme Batch Ingestion"
+echo "========================================================="
+
+if [ ! -f "en.openfoodfacts.org.products.csv" ] && [ ! -f "fr.openfoodfacts.org.products.csv" ]; then
+    echo "❌ Error: CSV files not found in the current directory."
+    echo "Please download the massive CSVs before running this batch."
+    exit 1
+fi
+
+echo "🚀 Starting database wipe and reset..."
+# Automatically run the new DB setup to drop the rigid table
+python3 setup_db.py
+
+echo "🚀 Triggering background ingestion process via nohup..."
+echo "All outputs will be saved to ingestion_process.log"
+
+# Run detached in the background; nohup keeps the process alive after logout
+nohup python3 -u ingest_csv.py > ingestion_process.log 2>&1 &
+BG_PID=$!
+
+echo "✅ Process started in the background (PID: $BG_PID)"
+echo "You can now safely close your terminal or turn off your computer."
+echo "To monitor progress from the server later, run:"
+echo "   tail -f ingestion_process.log"