@@ -2,6 +2,7 @@ import pandas as pd
 import myloginpath
 import urllib.parse
 from sqlalchemy import create_engine, text
+from sqlalchemy.types import VARCHAR, TEXT, DOUBLE
 import os
 import sys
 
@@ -25,25 +26,15 @@ def ingest_file(filename, engine):
         print(f"File {filename} not found locally.")
         return False
 
-    print(f"\n🚀 Found {filename}! Starting extreme batch ingestion into unified table...")
+    print(f"\n🚀 Found {filename}! Starting extreme batch ingestion for ALL columns...")
 
     chunk_size = 10000
    total_processed = 0
-
-    required_columns = [
-        'code', 'product_name', 'generic_name', 'brands', 'allergens', 'ingredients_text',
-        'proteins_100g', 'fat_100g', 'carbohydrates_100g', 'sugars_100g', 'sodium_100g', 'salt_100g',
-        'energy-kcal_100g', 'vitamin-a_100g', 'vitamin-d_100g', 'vitamin-e_100g', 'vitamin-k_100g',
-        'vitamin-c_100g', 'vitamin-b1_100g', 'vitamin-b2_100g', 'vitamin-pp_100g', 'vitamin-b6_100g',
-        'vitamin-b9_100g', 'vitamin-b12_100g', 'calcium_100g', 'iron_100g', 'magnesium_100g',
-        'zinc_100g', 'potassium_100g', 'cholesterol_100g', 'fiber_100g'
-    ]
+    is_first_chunk = True
 
     for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip', low_memory=False, encoding='utf-8'):
         try:
-            # Filter to only the columns that actually exist in this chunk and are in required_columns
-            available_cols = [c for c in required_columns if c in chunk.columns]
-            df = chunk[available_cols].copy()
+            df = chunk.copy()
 
             if 'code' not in df.columns:
                 continue
@@ -52,23 +43,32 @@ def ingest_file(filename, engine):
             df.dropna(subset=['code'], inplace=True)
             df.drop_duplicates(subset=['code'], inplace=True)
 
-            # Ensure all required columns exist in the dataframe (fill missing with None)
-            for col in required_columns:
-                if col not in df.columns:
-                    df[col] = None
-
-            # Reorder columns to exactly match the target table schema
-            df = df[required_columns]
+            # Map datatypes dynamically to avoid InnoDB row size limits:
+            # 'code' is VARCHAR(50) so it can carry the primary key; every other column is TEXT.
+            # (We read with dtype=str, so DOUBLE never applies, and TEXT is stored off-page.)
+            sql_dtypes = {col: TEXT() for col in df.columns}
+            sql_dtypes['code'] = VARCHAR(50)
 
+            if is_first_chunk:
+                # 1. Initialize the target table with the exact schema of the first chunk
+                df.head(0).to_sql('products', con=engine, if_exists='replace', index=False, dtype=sql_dtypes)
+
+                # 2. Add the primary key immediately
+                with engine.begin() as conn:
+                    conn.execute(text("ALTER TABLE products ADD PRIMARY KEY (code);"))
+                is_first_chunk = False
+
             # Write chunk to a temporary table
-            df.to_sql('temp_products', con=engine, if_exists='replace', index=False)
+            df.to_sql('temp_products', con=engine, if_exists='replace', index=False, dtype=sql_dtypes)
 
             # Use INSERT IGNORE to append to the main table, skipping any global duplicate codes
             with engine.begin() as connection:
-                connection.execute(text("INSERT IGNORE INTO products SELECT * FROM temp_products"))
+                # Ensure the column order matches by listing the columns explicitly
+                cols = ", ".join([f"`{c}`" for c in df.columns])
+                connection.execute(text(f"INSERT IGNORE INTO products ({cols}) SELECT {cols} FROM temp_products"))
 
             total_processed += len(df)
-            print(f" Successfully appended {total_processed} rows into unified schema...", end="\r")
+            print(f" Successfully appended {total_processed} rows into unified dynamic schema...", end="\r")
         except BaseException as e:
             print(f"\n [Warning] Chunk skipped due to error: {e}")
 
@@ -79,8 +79,42 @@ def ingest_file(filename, engine):
     print(f"\n✅ Finished importing {filename}.")
     return True
 
+def create_indexes(engine):
+    print("\n🛠️ Creating performance indexes (FULLTEXT and standard)...")
+    try:
+        with engine.begin() as connection:
+            # Add a FULLTEXT search index on the vital textual fields, if they exist
+            try:
+                connection.execute(text("ALTER TABLE products ADD FULLTEXT idx_search (product_name, ingredients_text);"))
+                print(" - Added FULLTEXT index on product_name, ingredients_text")
+            except Exception as e:
+                print(f" - Skipped FULLTEXT idx_search: {e}")
+
+            try:
+                connection.execute(text("ALTER TABLE products ADD FULLTEXT idx_allergens (allergens);"))
+                print(" - Added FULLTEXT index on allergens")
+            except Exception as e:
+                print(f" - Skipped FULLTEXT idx_allergens: {e}")
+
+            # Standard prefix indexes for fast exact matches
+            try:
+                connection.execute(text("ALTER TABLE products ADD INDEX idx_brands (brands(50));"))
+                print(" - Added INDEX on brands")
+            except Exception as e:
+                print(f" - Skipped INDEX idx_brands: {e}")
+
+            try:
+                connection.execute(text("ALTER TABLE products ADD INDEX idx_generic (generic_name(50));"))
+                print(" - Added INDEX on generic_name")
+            except Exception as e:
+                print(f" - Skipped INDEX idx_generic: {e}")
+
+        print("✅ Indexing complete!")
+    except Exception as e:
+        print(f"❌ Indexing encountered an issue: {e}")
+
 if __name__ == "__main__":
-    print("Initiating OpenFoodFacts CSV Unified Ingestion Process...")
+    print("Initiating OpenFoodFacts CSV Unified Dynamic Ingestion Process...")
     engine = get_loader_engine()
 
     processed_en = ingest_file('en.openfoodfacts.org.products.csv', engine)
@@ -90,4 +124,5 @@ if __name__ == "__main__":
         print("\n❌ Could not find either 'en.openfoodfacts.org.products.csv' or 'fr.openfoodfacts.org.products.csv'.")
         print("Please download them directly into the root folder and run this script again.")
     else:
+        create_indexes(engine)
         print("\n🎉 Full database reload complete! Ready for AI RAG.")
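
Note: get_loader_engine() is called by this diff but defined earlier in the file, outside the hunks shown. For readers applying the patch in isolation, a minimal sketch of what it could look like, combining the myloginpath and urllib.parse imports at the top of the script; the login-path name 'client', the database name 'food', and the pymysql driver are assumptions, not part of the diff:

import urllib.parse

import myloginpath
from sqlalchemy import create_engine

def get_loader_engine():
    # Read credentials from ~/.mylogin.cnf; the 'client' login path is an assumption
    conf = myloginpath.parse('client')
    password = urllib.parse.quote_plus(conf['password'])  # escape special characters for the URL
    host = conf.get('host', 'localhost')
    port = conf.get('port', 3306)
    # 'food' is a placeholder database name
    return create_engine(f"mysql+pymysql://{conf['user']}:{password}@{host}:{port}/food")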
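
Usage note: the FULLTEXT indexes created by create_indexes() exist to serve MySQL MATCH ... AGAINST queries, which is what makes the finished table useful for retrieval. A small illustrative lookup against the loaded table, with an arbitrary search phrase:

from sqlalchemy import text

with engine.connect() as conn:
    rows = conn.execute(text(
        "SELECT code, product_name FROM products "
        "WHERE MATCH(product_name, ingredients_text) "
        "AGAINST(:q IN NATURAL LANGUAGE MODE) LIMIT 10"
    ), {"q": "dark chocolate hazelnut"})
    for code, name in rows:
        print(code, name)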