Refactor database pipeline to unified schema bypassing row size limits

lanfr144 2 weeks ago
parent
commit
cb8b0b645e
2 changed files with 62 additions and 69 deletions
  1. ingest_csv.py (+39, -68)
  2. setup_db.py (+23, -1)

+ 39 - 68
ingest_csv.py

@@ -25,86 +25,59 @@ def ingest_file(filename, engine):
         print(f"File {filename} not found locally.")
         print(f"File {filename} not found locally.")
         return False
         return False
         
         
-    print(f"\n🚀 Found {filename}! Starting extreme batch ingestion...")
+    print(f"\n🚀 Found {filename}! Starting extreme batch ingestion into unified table...")
-    chunk_size = 5000 
+    chunk_size = 10000 
     total_processed = 0

-    # Read dynamically without filtering. Setting low_memory=False to let pandas parse column types flexibly
-    # Forced utf-8 encoding to prevent French accent corruption on Windows OS defaults
+    required_columns = [
+        'code', 'product_name', 'generic_name', 'brands', 'allergens', 'ingredients_text',
+        'proteins_100g', 'fat_100g', 'carbohydrates_100g', 'sugars_100g', 'sodium_100g', 
+        'energy-kcal_100g', 'vitamin-c_100g', 'iron_100g', 'calcium_100g'
+    ]
+
     for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip', low_memory=False, encoding='utf-8'):
         try:
-            # Drop duplicates by code natively
-            if 'code' in chunk.columns:
-                df = chunk.drop_duplicates(subset=['code'])
-            else:
-                df = chunk
-            # Eliminate completely empty columns to save storage
-            df.dropna(axis=1, how='all', inplace=True)
+            # Filter to only the columns that actually exist in this chunk and are in required_columns
+            available_cols = [c for c in required_columns if c in chunk.columns]
+            df = chunk[available_cols].copy()
+            
+            if 'code' not in df.columns:
+                continue
+
+            # Drop missing codes and local duplicates
+            df.dropna(subset=['code'], inplace=True)
+            df.drop_duplicates(subset=['code'], inplace=True)
-            # Segment the dataframe into chunks of 50 columns each to bypass InnoDB constraints
-            cols = list(df.columns)
-            if 'code' in cols: cols.remove('code')
+            # Ensure all required columns exist in the dataframe (fill missing with None)
+            for col in required_columns:
+                if col not in df.columns:
+                    df[col] = None
+                    
+            # Reorder columns to exactly match the target table schema
+            df = df[required_columns]
-            p_chunk_size = 4 # Extreme safe size for TEXT columns to stay under 8126 byte row limit
-            chunks = [cols[i:i + p_chunk_size] for i in range(0, len(cols), p_chunk_size)]
+            # Write chunk to a temporary table
+            df.to_sql('temp_products', con=engine, if_exists='replace', index=False)
+            
+            # Use INSERT IGNORE to append to the main table, skipping any global duplicate codes
+            with engine.begin() as connection:
+                connection.execute(text("INSERT IGNORE INTO products SELECT * FROM temp_products"))
-            for i, col_chunk in enumerate(chunks):
-                table_name = f'products_{i+1}'
-                df_slice = df[['code'] + col_chunk].copy()
-                df_slice.to_sql(table_name, con=engine, if_exists='append', index=False)
-
             total_processed += len(df)
-            print(f"   Successfully appended {total_processed} rows (Dynamic schema)...", end="\r")
+            print(f"   Successfully appended {total_processed} rows into unified schema...", end="\r")
         except BaseException as e:
-            if "Duplicate entry" in str(e):
-                pass
-            else:
-                 print(f"\n   [Warning] Chunk skipped due to error: {e}")
+            print(f"\n   [Warning] Chunk skipped due to error: {e}")
+            
+    # Cleanup temp table
+    with engine.begin() as connection:
+        connection.execute(text("DROP TABLE IF EXISTS temp_products"))

     print(f"\n✅ Finished importing {filename}.")
     return True

-def create_indexes(engine):
-    # Determine how many tables were actually created
-    num_tables = 0
-    with engine.connect() as conn:
-        res = conn.execute(text("SHOW TABLES LIKE 'products_%'"))
-        num_tables = len(res.fetchall())
-
-    print(f"\n🛠️ Creating performance indexes on {num_tables} partition tables...")
-    try:
-        with engine.begin() as connection:
-            # Enforce Primary Keys on ALL partitions
-            for i in range(1, num_tables + 1):
-                try:
-                    connection.execute(text(f"ALTER TABLE products_{i} MODIFY code VARCHAR(50);"))
-                    connection.execute(text(f"ALTER TABLE products_{i} ADD PRIMARY KEY (code);"))
-                except: pass
-
-            print("  Building Global MySQL VIEW...")
-            view_sql = f"CREATE VIEW products AS SELECT p1.* "
-            joins = []
-            for i in range(2, num_tables + 1):
-                # Get columns for this table except 'code'
-                cols_res = connection.execute(text(f"SHOW COLUMNS FROM products_{i}"))
-                table_cols = [c[0] for c in cols_res.fetchall() if c[0] != 'code']
-                if table_cols:
-                    view_sql += ", " + ", ".join([f"p{i}.`{c}`" for c in table_cols])
-                joins.append(f"LEFT JOIN products_{i} p{i} ON p1.code = p{i}.code")
-            
-            view_sql += " FROM products_1 p1 " + " ".join(joins)
-            
-            try:
-                connection.execute(text(view_sql))
-            except Exception as ev:
-                print(f"  Warning: View creation failed: {ev}")
-        print("✅ Indexing Complete!")
-    except Exception as e:
-        print(f"❌ Indexing encountered an issue: {e}")
-
 if __name__ == "__main__":
-    print("Initiating OpenFoodFacts CSV Ingestion Process...")
+    print("Initiating OpenFoodFacts CSV Unified Ingestion Process...")
     engine = get_loader_engine()

     processed_en = ingest_file('en.openfoodfacts.org.products.csv', engine)
@@ -114,6 +87,4 @@ if __name__ == "__main__":
         print("\n❌ Could not find either 'en.openfoodfacts.org.products.csv' or 'fr.openfoodfacts.org.products.csv'.")
         print("\n❌ Could not find either 'en.openfoodfacts.org.products.csv' or 'fr.openfoodfacts.org.products.csv'.")
         print("Please download them directly into the root folder and run this script again.")
         print("Please download them directly into the root folder and run this script again.")
     else:
     else:
-        # Build indexes now that all data is appended!
-        create_indexes(engine)
-        print("\n🎉 Full database reload and indexing complete! Ready for AI RAG.")
+        print("\n🎉 Full database reload complete! Ready for AI RAG.")
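Note: the new ingest path stages each pandas chunk in a scratch table and lets MySQL's INSERT IGNORE drop rows whose code already exists, so cross-file deduplication happens inside the database instead of in Python. A minimal standalone sketch of that pattern, assuming a reachable MySQL instance (the DSN and the two-column demo tables are placeholders, not the repo's real schema):

    # Sketch of the staging-table dedup pattern; DSN and tables are illustrative.
    import pandas as pd
    from sqlalchemy import create_engine, text

    engine = create_engine("mysql+pymysql://user:pass@localhost/food_db")  # placeholder DSN

    with engine.begin() as conn:
        conn.execute(text("""
            CREATE TABLE IF NOT EXISTS products_demo (
                code VARCHAR(50) PRIMARY KEY,
                product_name TEXT NULL
            ) ENGINE=InnoDB
        """))

    def upsert_chunk(df: pd.DataFrame) -> None:
        # 'replace' recreates the scratch table per chunk, so no stale rows linger.
        # Column order in df must match the target table, which is why the real
        # script reorders every chunk to required_columns before writing.
        df.to_sql("temp_demo", con=engine, if_exists="replace", index=False)
        with engine.begin() as conn:
            # IGNORE downgrades duplicate-key errors on the primary key to
            # warnings, so the rest of the batch still lands.
            conn.execute(text("INSERT IGNORE INTO products_demo SELECT * FROM temp_demo"))

    upsert_chunk(pd.DataFrame({"code": ["001"], "product_name": ["Oat bar"]}))
    # '001' is silently skipped on the second call; only '002' is added.
    upsert_chunk(pd.DataFrame({"code": ["001", "002"], "product_name": ["Oat bar", "Rye crisp"]}))

Keeping the dedup in SQL avoids per-row existence checks from Python and lets each chunk go in as a single set-based statement.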

+ 23 - 1
setup_db.py

@@ -125,10 +125,32 @@ def run_db_setup():
     ) ENGINE=InnoDB;
     """)

-    # 4. Products Table (Dynamic Drop for partitioned logic)
+    # 4. Products Table (Unified)
     for i in range(1, 101): # Drop up to 100 partitions just in case
         cursor.execute(f"DROP TABLE IF EXISTS food_db.products_{i};")
     cursor.execute("DROP VIEW IF EXISTS food_db.products;")
     cursor.execute("DROP VIEW IF EXISTS food_db.products;")
+    cursor.execute("DROP TABLE IF EXISTS food_db.products;")
+    
+    cursor.execute("""
+    CREATE TABLE IF NOT EXISTS food_db.products (
+        code VARCHAR(50) PRIMARY KEY,
+        product_name TEXT NULL,
+        generic_name TEXT NULL,
+        brands TEXT NULL,
+        allergens TEXT NULL,
+        ingredients_text TEXT NULL,
+        proteins_100g DOUBLE NULL,
+        fat_100g DOUBLE NULL,
+        carbohydrates_100g DOUBLE NULL,
+        sugars_100g DOUBLE NULL,
+        sodium_100g DOUBLE NULL,
+        `energy-kcal_100g` DOUBLE NULL,
+        `vitamin-c_100g` DOUBLE NULL,
+        iron_100g DOUBLE NULL,
+        calcium_100g DOUBLE NULL,
+        FULLTEXT idx_search (product_name, ingredients_text)
+    ) ENGINE=InnoDB;
+    """)

     # Table Context Grants (PoLP)
     # The authenticated app process can handle credentials and now read/write custom plates!
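With the unified table in place, the FULLTEXT index idx_search over (product_name, ingredients_text) supports relevance-ranked text search (InnoDB has supported FULLTEXT indexes since MySQL 5.6). A sketch of how a consumer of this schema might query it; the DSN and search terms are placeholders:

    # Sketch: querying the FULLTEXT index created by run_db_setup() above.
    from sqlalchemy import create_engine, text

    engine = create_engine("mysql+pymysql://user:pass@localhost/food_db")  # placeholder DSN

    # MATCH() must name the same columns, in the same order, as idx_search.
    query = text("""
        SELECT code, product_name,
               MATCH(product_name, ingredients_text)
                   AGAINST(:terms IN NATURAL LANGUAGE MODE) AS score
        FROM products
        WHERE MATCH(product_name, ingredients_text)
              AGAINST(:terms IN NATURAL LANGUAGE MODE)
        ORDER BY score DESC
        LIMIT 10
    """)

    with engine.connect() as conn:
        for row in conn.execute(query, {"terms": "dark chocolate hazelnut"}):
            print(row.code, row.product_name, round(row.score, 3))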