hace 2 semanas · 620543f87d
--- a/ingest_csv.py
+++ b/ingest_csv.py
@@ -2,6 +2,7 @@ import pandas as pd
 
				 import myloginpath
			
 
				 import urllib.parse
			
 
				 from sqlalchemy import create_engine, text
			
 
				+from sqlalchemy.types import VARCHAR, TEXT, DOUBLE
			
 
				 import os
			
 
				 import sys
			
 
				 
			
@@ -25,25 +26,15 @@ def ingest_file(filename, engine):
 
				         print(f"File {filename} not found locally.")
			
 
				         return False
			
 
				         
			
 
				-    print(f"\n🚀 Found {filename}! Starting extreme batch ingestion into unified table...")
			
 
				+    print(f"\n🚀 Found {filename}! Starting extreme batch ingestion for ALL columns...")
			
 
				     
			
 
				     chunk_size = 10000 
			
 
				     total_processed = 0
			
 
				-
			
 
				-    required_columns = [
			
 
				-        'code', 'product_name', 'generic_name', 'brands', 'allergens', 'ingredients_text',
			
 
				-        'proteins_100g', 'fat_100g', 'carbohydrates_100g', 'sugars_100g', 'sodium_100g', 'salt_100g',
			
 
				-        'energy-kcal_100g', 'vitamin-a_100g', 'vitamin-d_100g', 'vitamin-e_100g', 'vitamin-k_100g',
			
 
				-        'vitamin-c_100g', 'vitamin-b1_100g', 'vitamin-b2_100g', 'vitamin-pp_100g', 'vitamin-b6_100g',
			
 
				-        'vitamin-b9_100g', 'vitamin-b12_100g', 'calcium_100g', 'iron_100g', 'magnesium_100g',
			
 
				-        'zinc_100g', 'potassium_100g', 'cholesterol_100g', 'fiber_100g'
			
 
				-    ]
			
 
				+    is_first_chunk = True
			
 
				 
			
 
				     for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip', low_memory=False, encoding='utf-8'):
			
 
				         try:
			
 
				-            # Filter to only the columns that actually exist in this chunk and are in required_columns
			
 
				-            available_cols = [c for c in required_columns if c in chunk.columns]
			
 
				-            df = chunk[available_cols].copy()
			
 
				+            df = chunk.copy()
			
 
				             
			
 
				             if 'code' not in df.columns:
			
 
				                 continue
			
@@ -52,23 +43,32 @@ def ingest_file(filename, engine):
 
				             df.dropna(subset=['code'], inplace=True)
			
 
				             df.drop_duplicates(subset=['code'], inplace=True)
			
 
				             
			
 
				-            # Ensure all required columns exist in the dataframe (fill missing with None)
			
 
				-            for col in required_columns:
			
 
				-                if col not in df.columns:
			
 
				-                    df[col] = None
			
 
				-                    
			
 
				-            # Reorder columns to exactly match the target table schema
			
 
				-            df = df[required_columns]
			
 
				+            # Map datatypes dynamically to avoid InnoDB row size limits
			
 
				+            # Code is VARCHAR(50), everything else is TEXT (strings) or DOUBLE (if we were casting, but we read as str)
			
 
				+            # Since we read dtype=str, pandas will default all to TEXT which is perfect for Off-Page storage.
			
 
				+            sql_dtypes = {col: TEXT() for col in df.columns}
			
 
				+            sql_dtypes['code'] = VARCHAR(50)
			
 
				             
			
 
				+            if is_first_chunk:
			
 
				+                # 1. Initialize the target table with the exact schema from the first chunk
			
 
				+                df.head(0).to_sql('products', con=engine, if_exists='replace', index=False, dtype=sql_dtypes)
			
 
				+                
			
 
				+                # 2. Add Primary Key immediately
			
 
				+                with engine.begin() as conn:
			
 
				+                    conn.execute(text("ALTER TABLE products ADD PRIMARY KEY (code);"))
			
 
				+                is_first_chunk = False
			
 
				+
			
 
				             # Write chunk to a temporary table
			
 
				-            df.to_sql('temp_products', con=engine, if_exists='replace', index=False)
			
 
				+            df.to_sql('temp_products', con=engine, if_exists='replace', index=False, dtype=sql_dtypes)
			
 
				             
			
 
				             # Use INSERT IGNORE to append to the main table, skipping any global duplicate codes
			
 
				             with engine.begin() as connection:
			
 
				-                connection.execute(text("INSERT IGNORE INTO products SELECT * FROM temp_products"))
			
 
				+                # Ensure columns match by explicitly listing them
			
 
				+                cols = ", ".join([f"`{c}`" for c in df.columns])
			
 
				+                connection.execute(text(f"INSERT IGNORE INTO products ({cols}) SELECT {cols} FROM temp_products"))
			
 
				             
			
 
				             total_processed += len(df)
			
 
				-            print(f"   Successfully appended {total_processed} rows into unified schema...", end="\r")
			
 
				+            print(f"   Successfully appended {total_processed} rows into unified dynamic schema...", end="\r")
			
 
				         except BaseException as e:
			
 
				             print(f"\n   [Warning] Chunk skipped due to error: {e}")
			
 
				             
			
@@ -79,8 +79,42 @@ def ingest_file(filename, engine):
 
				     print(f"\n✅ Finished importing {filename}.")
			
 
				     return True
			
 
				 
			
 
				+def create_indexes(engine):
			
 
				+    print("\n🛠️ Creating performance indexes (FULLTEXT and Standard)...")
			
 
				+    try:
			
 
				+        with engine.begin() as connection:
			
 
				+            # Add Fulltext Search on vital textual fields if they exist
			
 
				+            try:
			
 
				+                connection.execute(text("ALTER TABLE products ADD FULLTEXT idx_search (product_name, ingredients_text);"))
			
 
				+                print("  - Added FULLTEXT index on product_name, ingredients_text")
			
 
				+            except Exception as e:
			
 
				+                print(f"  - Skipped FULLTEXT idx_search: {e}")
			
 
				+                
			
 
				+            try:
			
 
				+                connection.execute(text("ALTER TABLE products ADD FULLTEXT idx_allergens (allergens);"))
			
 
				+                print("  - Added FULLTEXT index on allergens")
			
 
				+            except Exception as e:
			
 
				+                print(f"  - Skipped FULLTEXT idx_allergens: {e}")
			
 
				+
			
 
				+            # Standard indexes for fast exact matches
			
 
				+            try:
			
 
				+                connection.execute(text("ALTER TABLE products ADD INDEX idx_brands (brands(50));"))
			
 
				+                print("  - Added INDEX on brands")
			
 
				+            except Exception as e:
			
 
				+                print(f"  - Skipped INDEX idx_brands: {e}")
			
 
				+                
			
 
				+            try:
			
 
				+                connection.execute(text("ALTER TABLE products ADD INDEX idx_generic (generic_name(50));"))
			
 
				+                print("  - Added INDEX on generic_name")
			
 
				+            except Exception as e:
			
 
				+                print(f"  - Skipped INDEX idx_generic: {e}")
			
 
				+
			
 
				+        print("✅ Indexing Complete!")
			
 
				+    except Exception as e:
			
 
				+        print(f"❌ Indexing encountered an issue: {e}")
			
 
				+
			
 
				 if __name__ == "__main__":
			
 
				-    print("Initiating OpenFoodFacts CSV Unified Ingestion Process...")
			
 
				+    print("Initiating OpenFoodFacts CSV Unified Dynamic Ingestion Process...")
			
 
				     engine = get_loader_engine()
			
 
				     
			
 
				     processed_en = ingest_file('en.openfoodfacts.org.products.csv', engine)
			
@@ -90,4 +124,5 @@ if __name__ == "__main__":
 
				         print("\n❌ Could not find either 'en.openfoodfacts.org.products.csv' or 'fr.openfoodfacts.org.products.csv'.")
			
 
				         print("Please download them directly into the root folder and run this script again.")
			
 
				     else:
			
 
				+        create_indexes(engine)
			
 
				         print("\n🎉 Full database reload complete! Ready for AI RAG.")
			
--- a/setup_db.py
+++ b/setup_db.py
@@ -125,48 +125,7 @@ def run_db_setup():
 
				     ) ENGINE=InnoDB;
			
 
				     """)
			
 
				 
			
 
				-    # 4. Products Table (Unified)
			
 
				-    for i in range(1, 101): # Drop up to 100 partitions just in case
			
 
				-        cursor.execute(f"DROP TABLE IF EXISTS food_db.products_{i};")
			
 
				-    cursor.execute("DROP VIEW IF EXISTS food_db.products;")
			
 
				-    cursor.execute("DROP TABLE IF EXISTS food_db.products;")
			
 
				-    
			
 
				-    cursor.execute("""
			
 
				-    CREATE TABLE IF NOT EXISTS food_db.products (
			
 
				-        code VARCHAR(50) PRIMARY KEY,
			
 
				-        product_name TEXT NULL,
			
 
				-        generic_name TEXT NULL,
			
 
				-        brands TEXT NULL,
			
 
				-        allergens TEXT NULL,
			
 
				-        ingredients_text TEXT NULL,
			
 
				-        proteins_100g DOUBLE NULL,
			
 
				-        fat_100g DOUBLE NULL,
			
 
				-        carbohydrates_100g DOUBLE NULL,
			
 
				-        sugars_100g DOUBLE NULL,
			
 
				-        sodium_100g DOUBLE NULL,
			
 
				-        salt_100g DOUBLE NULL,
			
 
				-        `energy-kcal_100g` DOUBLE NULL,
			
 
				-        `vitamin-a_100g` DOUBLE NULL,
			
 
				-        `vitamin-d_100g` DOUBLE NULL,
			
 
				-        `vitamin-e_100g` DOUBLE NULL,
			
 
				-        `vitamin-k_100g` DOUBLE NULL,
			
 
				-        `vitamin-c_100g` DOUBLE NULL,
			
 
				-        `vitamin-b1_100g` DOUBLE NULL,
			
 
				-        `vitamin-b2_100g` DOUBLE NULL,
			
 
				-        `vitamin-pp_100g` DOUBLE NULL,
			
 
				-        `vitamin-b6_100g` DOUBLE NULL,
			
 
				-        `vitamin-b9_100g` DOUBLE NULL,
			
 
				-        `vitamin-b12_100g` DOUBLE NULL,
			
 
				-        calcium_100g DOUBLE NULL,
			
 
				-        iron_100g DOUBLE NULL,
			
 
				-        magnesium_100g DOUBLE NULL,
			
 
				-        zinc_100g DOUBLE NULL,
			
 
				-        potassium_100g DOUBLE NULL,
			
 
				-        cholesterol_100g DOUBLE NULL,
			
 
				-        fiber_100g DOUBLE NULL,
			
 
				-        FULLTEXT idx_search (product_name, ingredients_text)
			
 
				-    ) ENGINE=InnoDB;
			
 
				-    """)
			
 
				+    # The products table is now dynamically generated by ingest_csv.py to support all ~200 columns.
			
 
				     
			
 
				     # Table Context Grants (PoLP)
			
 
				     # The authenticated app process can handle credentials and now read/write custom plates!
			
@@ -175,7 +134,7 @@ def run_db_setup():
 
				     cursor.execute("GRANT SELECT, INSERT, UPDATE, DELETE ON food_db.plates TO 'db_app_auth'@'%';")
			
 
				     cursor.execute("GRANT SELECT, INSERT, UPDATE, DELETE ON food_db.plate_items TO 'db_app_auth'@'%';")
			
 
				     
			
 
				-    # Give the app read privileges on the whole database (including the products view when created)
			
 
				+    # Give the app read privileges on the whole database
			
 
				     cursor.execute("GRANT SELECT ON food_db.* TO 'db_app_auth'@'%';")
			
 
				     
			
 
				     cursor.execute("GRANT SELECT ON food_db.* TO 'db_reader'@'%';")
			
--- a/unit_converter.py
+++ b/unit_converter.py
@@ -0,0 +1,161 @@
 
				+import re
			
 
				+
			
 
				+class UnitConverter:
			
 
				+    """
			
 
				+    Utility class to convert culinary volumetric units to metric weight (grams)
			
 
				+    based on the specific product density.
			
 
				+    """
			
 
				+    
			
 
				+    # Common culinary volumetric units and their approximate volume in milliliters (ml)
			
 
				+    VOLUME_UNITS_ML = {
			
 
				+        'tsp': 5.0,
			
 
				+        'teaspoon': 5.0,
			
 
				+        'tbsp': 15.0,
			
 
				+        'tablespoon': 15.0,
			
 
				+        'cup': 240.0,
			
 
				+        'fl oz': 30.0,
			
 
				+        'fluid ounce': 30.0,
			
 
				+        'pint': 473.0,
			
 
				+        'quart': 946.0,
			
 
				+        'gallon': 3785.0,
			
 
				+        'cm3': 1.0,
			
 
				+        'cl': 10.0,
			
 
				+        'dl': 100.0,
			
 
				+        'l': 1000.0,
			
 
				+        'liter': 1000.0,
			
 
				+        'pinch': 0.36, # rough estimate
			
 
				+        'dash': 0.72,
			
 
				+    }
			
 
				+
			
 
				+    # Densities in grams per milliliter (g/ml)
			
 
				+    PRODUCT_DENSITIES = {
			
 
				+        # Baking and flours
			
 
				+        'flour': 0.53,
			
 
				+        'all-purpose flour': 0.53,
			
 
				+        'wheat flour': 0.53,
			
 
				+        'sugar': 0.85,
			
 
				+        'white sugar': 0.85,
			
 
				+        'granulated sugar': 0.85,
			
 
				+        'powdered sugar': 0.50,
			
 
				+        'icing sugar': 0.50,
			
 
				+        'brown sugar': 0.83,
			
 
				+        'salt': 1.20,
			
 
				+        'table salt': 1.20,
			
 
				+        'baking powder': 0.90,
			
 
				+        'baking soda': 1.10,
			
 
				+        'cocoa powder': 0.42,
			
 
				+        
			
 
				+        # Liquids
			
 
				+        'water': 1.0,
			
 
				+        'milk': 1.03,
			
 
				+        'heavy cream': 0.99,
			
 
				+        'vegetable oil': 0.92,
			
 
				+        'olive oil': 0.92,
			
 
				+        'honey': 1.42,
			
 
				+        'maple syrup': 1.32,
			
 
				+        'butter': 0.96, # melted or solid approx
			
 
				+        'melted butter': 0.94,
			
 
				+        
			
 
				+        # Grains and dry goods
			
 
				+        'rice': 0.85,
			
 
				+        'white rice': 0.85,
			
 
				+        'oats': 0.38,
			
 
				+        'rolled oats': 0.38,
			
 
				+        'quinoa': 0.72,
			
 
				+        'couscous': 0.72,
			
 
				+        'lentils': 0.85,
			
 
				+        
			
 
				+        # Condiments
			
 
				+        'ketchup': 1.15,
			
 
				+        'mustard': 1.05,
			
 
				+        'mayonnaise': 0.95,
			
 
				+        'peanut butter': 1.08,
			
 
				+        
			
 
				+        # Default density for unknown items (approximate density of water/mixed food)
			
 
				+        'default': 1.0
			
 
				+    }
			
 
				+    
			
 
				+    # Direct weight conversions (already in weight, just need unit conversion)
			
 
				+    WEIGHT_UNITS_G = {
			
 
				+        'g': 1.0,
			
 
				+        'gram': 1.0,
			
 
				+        'kg': 1000.0,
			
 
				+        'kilo': 1000.0,
			
 
				+        'kilogram': 1000.0,
			
 
				+        'oz': 28.35,
			
 
				+        'ounce': 28.35,
			
 
				+        'lb': 453.59,
			
 
				+        'pound': 453.59,
			
 
				+        'mg': 0.001
			
 
				+    }
			
 
				+
			
 
				+    @classmethod
			
 
				+    def get_density(cls, product_name):
			
 
				+        if not product_name:
			
 
				+            return cls.PRODUCT_DENSITIES['default']
			
 
				+            
			
 
				+        product_name = str(product_name).lower().strip()
			
 
				+        
			
 
				+        # Exact match
			
 
				+        if product_name in cls.PRODUCT_DENSITIES:
			
 
				+            return cls.PRODUCT_DENSITIES[product_name]
			
 
				+            
			
 
				+        # Partial match
			
 
				+        for key, density in cls.PRODUCT_DENSITIES.items():
			
 
				+            if key in product_name:
			
 
				+                return density
			
 
				+                
			
 
				+        return cls.PRODUCT_DENSITIES['default']
			
 
				+
			
 
				+    @classmethod
			
 
				+    def convert_to_grams(cls, amount, unit, product_name=None):
			
 
				+        """
			
 
				+        Converts an amount and unit of a specific product to grams.
			
 
				+        """
			
 
				+        unit = str(unit).lower().strip()
			
 
				+        
			
 
				+        # If it's already a weight unit, simple scalar conversion
			
 
				+        for w_unit, g_factor in cls.WEIGHT_UNITS_G.items():
			
 
				+            if unit == w_unit or unit == f"{w_unit}s":
			
 
				+                return amount * g_factor
			
 
				+                
			
 
				+        # If it's a volumetric unit, use density
			
 
				+        volume_ml = None
			
 
				+        for v_unit, ml_factor in cls.VOLUME_UNITS_ML.items():
			
 
				+            if unit == v_unit or unit == f"{v_unit}s":
			
 
				+                volume_ml = amount * ml_factor
			
 
				+                break
			
 
				+                
			
 
				+        if volume_ml is not None:
			
 
				+            density = cls.get_density(product_name)
			
 
				+            return volume_ml * density
			
 
				+            
			
 
				+        # Unrecognized unit
			
 
				+        return None
			
 
				+
			
 
				+    @classmethod
			
 
				+    def parse_and_convert(cls, recipe_string, product_name=None):
			
 
				+        """
			
 
				+        Parses a string like "1.5 cups" or "2 tbsp" and converts to grams.
			
 
				+        """
			
 
				+        # Match number (including decimals/fractions roughly) followed by text
			
 
				+        match = re.match(r'^([\d\.]+)\s*([a-zA-Z\s]+)$', str(recipe_string).strip())
			
 
				+        if match:
			
 
				+            try:
			
 
				+                amount = float(match.group(1))
			
 
				+                unit = match.group(2).strip()
			
 
				+                result = cls.convert_to_grams(amount, unit, product_name)
			
 
				+                if result is not None:
			
 
				+                    return round(result, 2)
			
 
				+            except ValueError:
			
 
				+                pass
			
 
				+        return None
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    # Tests
			
 
				+    print("1 cup of all-purpose flour:", UnitConverter.parse_and_convert("1 cup", "all-purpose flour"), "g")
			
 
				+    print("1 cup of white sugar:", UnitConverter.parse_and_convert("1 cup", "white sugar"), "g")
			
 
				+    print("1 cup of water:", UnitConverter.parse_and_convert("1 cup", "water"), "g")
			
 
				+    print("2 tbsp of olive oil:", UnitConverter.parse_and_convert("2 tbsp", "olive oil"), "g")
			
 
				+    print("1 pound of generic food:", UnitConverter.parse_and_convert("1 pound", "unknown"), "g")
			
 
				+    print("1 pinch of salt:", UnitConverter.parse_and_convert("1 pinch", "salt"), "g")