
TG-2: Restructure schema for all CSV columns, async ingestion, and mail forwarding

lanfr144 3 weeks ago
parent
commit ab7e3b1d3a
6 changed files with 126 additions and 36 deletions
  1. app.py  (+1, -1)
  2. check_projects.py  (+15, -0)
  3. ingest_csv.py  (+46, -22)
  4. setup_db.py  (+8, -13)
  5. setup_mail_forwarding.sh  (+26, -0)
  6. start_batch_ingest.sh  (+30, -0)

+ 1 - 1
app.py

@@ -128,7 +128,7 @@ with tab_chat:
 
         with st.spinner("Analyzing locally..."):
             try:
-                response = ollama.chat(model='llama3', messages=[
+                response = ollama.chat(model='mistral', messages=[
                     {'role': 'system', 'content': sys_prompt},
                     {'role': 'user', 'content': prompt}
                 ])

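The change above swaps the local model by editing app.py directly. A hedged follow-up sketch (not part of this commit): read the model name from an environment variable so future swaps stay out of the code. The OLLAMA_MODEL variable name and the helper function below are assumptions, not existing project code.

    # Hypothetical refactor: model name comes from the environment,
    # so llama3 -> mistral style swaps need no code change.
    import os
    import ollama

    OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "mistral")  # assumed variable name

    def ask_local_model(sys_prompt: str, prompt: str) -> str:
        # Same ollama.chat call pattern as app.py, with the model made configurable.
        response = ollama.chat(model=OLLAMA_MODEL, messages=[
            {'role': 'system', 'content': sys_prompt},
            {'role': 'user', 'content': prompt},
        ])
        return response['message']['content']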
+ 15 - 0
check_projects.py

@@ -0,0 +1,15 @@
+import requests
+import urllib3
+urllib3.disable_warnings()
+
+auth = requests.post(
+    'https://192.168.130.161/taiga/api/v1/auth', 
+    json={'type': 'normal', 'username': 'FrancoisLange', 'password': 'BTSai123'}, 
+    verify=False
+).json()
+
+headers = {'Authorization': f'Bearer {auth["auth_token"]}'}
+projs = requests.get('https://192.168.130.161/taiga/api/v1/projects', headers=headers, verify=False).json()
+print("Projects:")
+for p in projs:
+    print(f"ID: {p['id']}, Name: {p['name']}, Slug: {p['slug']}")

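check_projects.py stores the Taiga password in plain text and disables TLS verification. A minimal sketch of the same authentication call with credentials pulled from the environment is shown below; the TAIGA_URL, TAIGA_USER and TAIGA_PASSWORD variable names are assumptions, not part of the repository.

    # Illustrative only: same Taiga auth request, credentials from the environment.
    import os
    import requests
    import urllib3

    urllib3.disable_warnings()

    TAIGA_URL = os.environ.get("TAIGA_URL", "https://192.168.130.161/taiga")
    auth = requests.post(
        f"{TAIGA_URL}/api/v1/auth",
        json={
            "type": "normal",
            "username": os.environ["TAIGA_USER"],
            "password": os.environ["TAIGA_PASSWORD"],
        },
        verify=False,  # self-signed lab certificate, as in the committed script
    ).json()
    headers = {"Authorization": f"Bearer {auth['auth_token']}"}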
+ 46 - 22
ingest_csv.py

@@ -25,42 +25,62 @@ def ingest_file(filename, engine):
         print(f"File {filename} not found locally.")
         print(f"File {filename} not found locally.")
         return False
         return False
         
         
-    print(f"\n🚀 Found {filename}! Starting ingestion via SQLAlchemy pipeline...")
-    
-    expected_columns = [
-        "code", "url", "creator", "created_t", "created_datetime", "last_modified_t", 
-        "last_modified_datetime", "product_name", "generic_name", "quantity", "packaging", 
-        "brands", "categories", "origins", "labels", "stores", "countries", "ingredients_text", 
-        "allergens", "traces"
-    ]
+    print(f"\n🚀 Found {filename}! Starting extreme batch ingestion...")
     
     
     chunk_size = 5000 
     chunk_size = 5000 
     total_processed = 0
     total_processed = 0
 
 
-    for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip'):
-        # Filter explicitly to schema
-        available_cols = [col for col in expected_columns if col in chunk.columns]
-        df = chunk[available_cols]
-        
-        # Pandas to_sql safely transforms NaNs to SQL NULLs internally
+    # Read dynamically without filtering. Setting low_memory=False to let pandas parse column types flexibly
+    for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip', low_memory=False):
         try:
-            # We use 'append' because the products table already exists with primary keys
-            # To handle duplicate 'code' primary keys effortlessly, we drop duplicates from the dataframe before insert
-            # Or depend on PyMySQL. But pandas natively crashes on dupes unless managed. 
-            df = df.drop_duplicates(subset=['code'])
+            # Drop duplicates by code natively
+            if 'code' in chunk.columns:
+                df = chunk.drop_duplicates(subset=['code'])
+            else:
+                df = chunk
+                
             df.to_sql('products', con=engine, if_exists='append', index=False)
             total_processed += len(df)
-            print(f"   Successfully appended {total_processed} rows...")
+            print(f"   Successfully appended {total_processed} rows (Dynamic schema)...", end="\r")
         except BaseException as e:
-            # If a strict primary key duplicate existed in DB already from a previous chunk, ignore row crashes
             if "Duplicate entry" in str(e):
             if "Duplicate entry" in str(e):
                 pass
                 pass
             else:
             else:
-                 print(f"   [Warning] Chunk skipped due to internal structural error: {e}")
+                 print(f"\n   [Warning] Chunk skipped due to internal structural error: {e}")
         
         
-    print(f"✅ Finished importing {filename}.")
+    print(f"\n✅ Finished importing {filename}.")
     return True
 
+def create_indexes(engine):
+    print("\n🛠️ Creating performance indexes on newly generated table...")
+    # B-TREE and FULLTEXT INDEXES created post-ingestion for extreme speed
+    try:
+        with engine.begin() as connection:
+            print("  Building Primary Key on `code`...")
+            # We must make `code` the primary key if pandas just made it a TEXT field
+            # But MySQL cannot have a TEXT field as PRIMARY KEY without a length constraint.
+            # Convert code to VARCHAR(50) first.
+            connection.execute(urllib.parse.unquote("ALTER TABLE products MODIFY code VARCHAR(50);"))
+            connection.execute(urllib.parse.unquote("ALTER TABLE products ADD PRIMARY KEY (code);"))
+
+            print("  Building Fulltext Indexes...")
+            connection.execute(urllib.parse.unquote("CREATE FULLTEXT INDEX ft_idx_search ON products(product_name, ingredients_text, brands);"))
+            
+            print("  Building B-TREE Indexes on core macros...")
+            # We attempt to index key macros if they exist
+            macro_cols = ['energy-kcal_100g', 'fat_100g', 'carbohydrates_100g', 'proteins_100g']
+            for col in macro_cols:
+                # Convert TEXT to DOUBLE for numerical indexing and querying
+                # We catch errors if the column doesn't exist to be safe
+                try:
+                    connection.execute(urllib.parse.unquote(f"ALTER TABLE products MODIFY `{col}` DOUBLE;"))
+                    connection.execute(urllib.parse.unquote(f"CREATE INDEX idx_{col.replace('-', '_')} ON products(`{col}`);"))
+                except:
+                    pass
+        print("✅ Indexing Complete!")
+    except Exception as e:
+        print(f"❌ Indexing encountered an issue: {e}")
+
 if __name__ == "__main__":
     print("Initiating OpenFoodFacts CSV Ingestion Process...")
     engine = get_loader_engine()
@@ -71,3 +91,7 @@ if __name__ == "__main__":
     if not processed_en and not processed_fr:
         print("\n❌ Could not find either 'en.openfoodfacts.org.products.csv' or 'fr.openfoodfacts.org.products.csv'.")
         print("Please download them directly into the root folder and run this script again.")
+    else:
+        # Build indexes now that all data is appended!
+        create_indexes(engine)
+        print("\n🎉 Full database reload and indexing complete! Ready for AI RAG.")

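For reference, a minimal sketch of the same post-ingestion DDL issued through sqlalchemy.text(), which SQLAlchemy 2.x requires for raw SQL strings. This is an illustrative rewrite of create_indexes under that assumption, not the committed code.

    # Illustrative sketch: identical index-creation steps via sqlalchemy.text().
    from sqlalchemy import text

    def create_indexes_sketch(engine):
        with engine.begin() as connection:
            # `code` must be a bounded type before MySQL accepts it as a PRIMARY KEY.
            connection.execute(text("ALTER TABLE products MODIFY code VARCHAR(50);"))
            connection.execute(text("ALTER TABLE products ADD PRIMARY KEY (code);"))
            connection.execute(text(
                "CREATE FULLTEXT INDEX ft_idx_search "
                "ON products(product_name, ingredients_text, brands);"
            ))
            for col in ('energy-kcal_100g', 'fat_100g', 'carbohydrates_100g', 'proteins_100g'):
                try:
                    connection.execute(text(f"ALTER TABLE products MODIFY `{col}` DOUBLE;"))
                    connection.execute(text(
                        f"CREATE INDEX idx_{col.replace('-', '_')} ON products(`{col}`);"
                    ))
                except Exception:
                    pass  # column absent in this CSV snapshot; skip its index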
+ 8 - 13
setup_db.py

@@ -70,22 +70,17 @@ def run_db_setup():
         created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
     ) ENGINE=InnoDB;
     """)
-    # 2. Products Table
-    cursor.execute("""
-    CREATE TABLE IF NOT EXISTS food_db.products (
-        code VARCHAR(50) PRIMARY KEY, url TEXT, creator VARCHAR(255), created_t VARCHAR(50), 
-        created_datetime VARCHAR(50), last_modified_t VARCHAR(50), last_modified_datetime VARCHAR(50), 
-        product_name TEXT, generic_name TEXT, quantity VARCHAR(255), packaging TEXT, brands TEXT, 
-        categories TEXT, origins TEXT, labels TEXT, stores TEXT, countries TEXT, ingredients_text TEXT, 
-        allergens TEXT, traces TEXT, 
-        FULLTEXT INDEX ft_idx_search (product_name, ingredients_text)
-    ) ENGINE=InnoDB;
-    """)
+    # 2. Products Table (Dynamic Drop)
+    # We drop the strict schema completely. `ingest_csv.py` will use pandas to automatically 
+    # generate the table with 100% of the CSV columns dynamically defined as TEXT fields.
+    cursor.execute("DROP TABLE IF EXISTS food_db.products;")
     
     
     # Table Context Grants (SoD)
     # Table Context Grants (SoD)
     cursor.execute("GRANT SELECT, INSERT, UPDATE ON food_db.users TO 'db_app_auth'@'%';")
     cursor.execute("GRANT SELECT, INSERT, UPDATE ON food_db.users TO 'db_app_auth'@'%';")
-    cursor.execute("GRANT SELECT ON food_db.products TO 'db_reader'@'%';")
-    cursor.execute("GRANT SELECT, INSERT, UPDATE, DELETE, DROP, CREATE ON food_db.products TO 'db_loader'@'%';")
+    # Note: Reader/Loader grants on products table will be handled or applied at the database level
+    # since the table won't exist until pandas creates it. Granting at db-level for these specific users.
+    cursor.execute("GRANT SELECT ON food_db.* TO 'db_reader'@'%';")
+    cursor.execute("GRANT SELECT, INSERT, UPDATE, DELETE, DROP, CREATE, ALTER, INDEX ON food_db.* TO 'db_loader'@'%';")
     cursor.execute("FLUSH PRIVILEGES;")
     cursor.execute("FLUSH PRIVILEGES;")
 
 
     print("\n✅ Database, Users, and Tables created successfully!")
     print("\n✅ Database, Users, and Tables created successfully!")

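Because setup_db.py now drops food_db.products and leaves table creation to pandas, a quick way to confirm the generated schema is SQLAlchemy's inspector. The snippet below is illustrative only and assumes the same engine object that ingest_csv.py obtains from get_loader_engine().

    # Illustrative check: list the columns pandas generated for products.
    from sqlalchemy import inspect

    def describe_products(engine):
        inspector = inspect(engine)
        cols = inspector.get_columns('products')   # every CSV column, mostly TEXT
        print(f"products has {len(cols)} columns")
        for c in cols[:10]:
            print(f"  {c['name']}: {c['type']}")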
+ 26 - 0
setup_mail_forwarding.sh

@@ -0,0 +1,26 @@
+#!/bin/bash
+# run this as root/sudo on the Ubuntu VM
+
+echo "Setting up centralized mail forwarding to lanfr144@gmail.com..."
+
+# 1. Update the skeleton directory so all NEW users created automatically forward mail
+echo "lanfr144@gmail.com" | sudo tee /etc/skel/.forward
+sudo chmod 644 /etc/skel/.forward
+
+# 2. Add forwarding to all dynamically created home directories
+for user_dir in /home/*; do
+  if [ -d "$user_dir" ]; then
+    user_name=$(basename "$user_dir")
+    echo "lanfr144@gmail.com" | sudo tee "$user_dir/.forward"
+    sudo chown "$user_name":"$user_name" "$user_dir/.forward"
+    sudo chmod 644 "$user_dir/.forward"
+    echo "Configured for user: $user_name"
+  fi
+done
+
+# 3. Add forwarding for root manually
+echo "lanfr144@gmail.com" | sudo tee /root/.forward
+sudo chmod 644 /root/.forward
+echo "Configured for root."
+
+echo "✅ All system mail will now forward to lanfr144@gmail.com"

+ 30 - 0
start_batch_ingest.sh

@@ -0,0 +1,30 @@
+#!/bin/bash
+# Local Food AI - Disconnected Ingestion Wrapper
+# This script uses nohup to run the python ingestion script in the background.
+# You can exit your SSH session safely after starting this script.
+
+echo "========================================================="
+echo "🍔 Local Food AI: Extreme Batch Ingestion"
+echo "========================================================="
+
+if [ ! -f "en.openfoodfacts.org.products.csv" ] && [ ! -f "fr.openfoodfacts.org.products.csv" ]; then
+    echo "❌ Error: CSV files not found in the current directory."
+    echo "Please download the massive CSVs before running this batch."
+    exit 1
+fi
+
+echo "🚀 Starting database wipe and reset..."
+# Automatically run the new DB setup to drop the rigid table
+python3 setup_db.py
+
+echo "🚀 Triggering background ingestion process via nohup..."
+echo "All outputs will be saved to ingestion_process.log"
+
+# Run securely in background
+nohup python3 -u ingest_csv.py > ingestion_process.log 2>&1 &
+BG_PID=$!
+
+echo "✅ Process started in the background (PID: $BG_PID)"
+echo "You can now safely close your terminal or turn off your computer."
+echo "To monitor progress from the server later, run:"
+echo "   tail -f ingestion_process.log"