Browse Source

Implement full dynamic CSV schema ingestion and unit conversion module

lanfr144 2 weeks ago
parent
commit
620543f87d
3 changed files with 222 additions and 67 deletions
  1. 59 24
      ingest_csv.py
  2. 2 43
      setup_db.py
  3. 161 0
      unit_converter.py

+ 59 - 24
ingest_csv.py

@@ -2,6 +2,7 @@ import pandas as pd
 import myloginpath
 import myloginpath
 import urllib.parse
 import urllib.parse
 from sqlalchemy import create_engine, text
 from sqlalchemy import create_engine, text
+from sqlalchemy.types import VARCHAR, TEXT, DOUBLE
 import os
 import os
 import sys
 import sys
 
 
@@ -25,25 +26,15 @@ def ingest_file(filename, engine):
         print(f"File {filename} not found locally.")
         print(f"File {filename} not found locally.")
         return False
         return False
         
         
-    print(f"\n🚀 Found {filename}! Starting extreme batch ingestion into unified table...")
+    print(f"\n🚀 Found {filename}! Starting extreme batch ingestion for ALL columns...")
     
     
     chunk_size = 10000 
     chunk_size = 10000 
     total_processed = 0
     total_processed = 0
-
-    required_columns = [
-        'code', 'product_name', 'generic_name', 'brands', 'allergens', 'ingredients_text',
-        'proteins_100g', 'fat_100g', 'carbohydrates_100g', 'sugars_100g', 'sodium_100g', 'salt_100g',
-        'energy-kcal_100g', 'vitamin-a_100g', 'vitamin-d_100g', 'vitamin-e_100g', 'vitamin-k_100g',
-        'vitamin-c_100g', 'vitamin-b1_100g', 'vitamin-b2_100g', 'vitamin-pp_100g', 'vitamin-b6_100g',
-        'vitamin-b9_100g', 'vitamin-b12_100g', 'calcium_100g', 'iron_100g', 'magnesium_100g',
-        'zinc_100g', 'potassium_100g', 'cholesterol_100g', 'fiber_100g'
-    ]
+    is_first_chunk = True
 
 
     for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip', low_memory=False, encoding='utf-8'):
     for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip', low_memory=False, encoding='utf-8'):
         try:
         try:
-            # Filter to only the columns that actually exist in this chunk and are in required_columns
-            available_cols = [c for c in required_columns if c in chunk.columns]
-            df = chunk[available_cols].copy()
+            df = chunk.copy()
             
             
             if 'code' not in df.columns:
             if 'code' not in df.columns:
                 continue
                 continue
@@ -52,23 +43,32 @@ def ingest_file(filename, engine):
             df.dropna(subset=['code'], inplace=True)
             df.dropna(subset=['code'], inplace=True)
             df.drop_duplicates(subset=['code'], inplace=True)
             df.drop_duplicates(subset=['code'], inplace=True)
             
             
-            # Ensure all required columns exist in the dataframe (fill missing with None)
-            for col in required_columns:
-                if col not in df.columns:
-                    df[col] = None
-                    
-            # Reorder columns to exactly match the target table schema
-            df = df[required_columns]
+            # Map datatypes dynamically to avoid InnoDB row size limits
+            # Code is VARCHAR(50), everything else is TEXT (strings) or DOUBLE (if we were casting, but we read as str)
+            # Since we read dtype=str, pandas will default all to TEXT which is perfect for Off-Page storage.
+            sql_dtypes = {col: TEXT() for col in df.columns}
+            sql_dtypes['code'] = VARCHAR(50)
             
             
+            if is_first_chunk:
+                # 1. Initialize the target table with the exact schema from the first chunk
+                df.head(0).to_sql('products', con=engine, if_exists='replace', index=False, dtype=sql_dtypes)
+                
+                # 2. Add Primary Key immediately
+                with engine.begin() as conn:
+                    conn.execute(text("ALTER TABLE products ADD PRIMARY KEY (code);"))
+                is_first_chunk = False
+
             # Write chunk to a temporary table
             # Write chunk to a temporary table
-            df.to_sql('temp_products', con=engine, if_exists='replace', index=False)
+            df.to_sql('temp_products', con=engine, if_exists='replace', index=False, dtype=sql_dtypes)
             
             
             # Use INSERT IGNORE to append to the main table, skipping any global duplicate codes
             # Use INSERT IGNORE to append to the main table, skipping any global duplicate codes
             with engine.begin() as connection:
             with engine.begin() as connection:
-                connection.execute(text("INSERT IGNORE INTO products SELECT * FROM temp_products"))
+                # Ensure columns match by explicitly listing them
+                cols = ", ".join([f"`{c}`" for c in df.columns])
+                connection.execute(text(f"INSERT IGNORE INTO products ({cols}) SELECT {cols} FROM temp_products"))
             
             
             total_processed += len(df)
             total_processed += len(df)
-            print(f"   Successfully appended {total_processed} rows into unified schema...", end="\r")
+            print(f"   Successfully appended {total_processed} rows into unified dynamic schema...", end="\r")
         except BaseException as e:
         except BaseException as e:
             print(f"\n   [Warning] Chunk skipped due to error: {e}")
             print(f"\n   [Warning] Chunk skipped due to error: {e}")
             
             
@@ -79,8 +79,42 @@ def ingest_file(filename, engine):
     print(f"\n✅ Finished importing {filename}.")
     print(f"\n✅ Finished importing {filename}.")
     return True
     return True
 
 
+def create_indexes(engine):
+    """Add the search and lookup indexes to the ``products`` table.
+
+    Issues four ``ALTER TABLE`` statements on one connection obtained from
+    ``engine.begin()``: two FULLTEXT indexes (``idx_search``,
+    ``idx_allergens``) and two prefix indexes (``idx_brands``,
+    ``idx_generic``). Every statement is attempted on its own; a failure is
+    printed as "Skipped ..." and the remaining statements still run.
+
+    Args:
+        engine: SQLAlchemy engine used to open the connection.
+    """
+    # (SQL statement, message printed on success, label printed on failure)
+    index_plan = (
+        (
+            "ALTER TABLE products ADD FULLTEXT idx_search (product_name, ingredients_text);",
+            "Added FULLTEXT index on product_name, ingredients_text",
+            "FULLTEXT idx_search",
+        ),
+        (
+            "ALTER TABLE products ADD FULLTEXT idx_allergens (allergens);",
+            "Added FULLTEXT index on allergens",
+            "FULLTEXT idx_allergens",
+        ),
+        (
+            "ALTER TABLE products ADD INDEX idx_brands (brands(50));",
+            "Added INDEX on brands",
+            "INDEX idx_brands",
+        ),
+        (
+            "ALTER TABLE products ADD INDEX idx_generic (generic_name(50));",
+            "Added INDEX on generic_name",
+            "INDEX idx_generic",
+        ),
+    )
+
+    print("\n🛠️ Creating performance indexes (FULLTEXT and Standard)...")
+    try:
+        with engine.begin() as connection:
+            for statement, success_note, skip_label in index_plan:
+                # A column missing from the dynamically created table makes
+                # the ALTER fail; that is reported and the loop moves on.
+                try:
+                    connection.execute(text(statement))
+                    print(f"  - {success_note}")
+                except Exception as e:
+                    print(f"  - Skipped {skip_label}: {e}")
+
+        print("✅ Indexing Complete!")
+    except Exception as e:
+        # Reached when opening or committing the connection block itself fails.
+        print(f"❌ Indexing encountered an issue: {e}")
+
 if __name__ == "__main__":
 if __name__ == "__main__":
-    print("Initiating OpenFoodFacts CSV Unified Ingestion Process...")
+    print("Initiating OpenFoodFacts CSV Unified Dynamic Ingestion Process...")
     engine = get_loader_engine()
     engine = get_loader_engine()
     
     
     processed_en = ingest_file('en.openfoodfacts.org.products.csv', engine)
     processed_en = ingest_file('en.openfoodfacts.org.products.csv', engine)
@@ -90,4 +124,5 @@ if __name__ == "__main__":
         print("\n❌ Could not find either 'en.openfoodfacts.org.products.csv' or 'fr.openfoodfacts.org.products.csv'.")
         print("\n❌ Could not find either 'en.openfoodfacts.org.products.csv' or 'fr.openfoodfacts.org.products.csv'.")
         print("Please download them directly into the root folder and run this script again.")
         print("Please download them directly into the root folder and run this script again.")
     else:
     else:
+        create_indexes(engine)
         print("\n🎉 Full database reload complete! Ready for AI RAG.")
         print("\n🎉 Full database reload complete! Ready for AI RAG.")

+ 2 - 43
setup_db.py

@@ -125,48 +125,7 @@ def run_db_setup():
     ) ENGINE=InnoDB;
     ) ENGINE=InnoDB;
     """)
     """)
 
 
-    # 4. Products Table (Unified)
-    for i in range(1, 101): # Drop up to 100 partitions just in case
-        cursor.execute(f"DROP TABLE IF EXISTS food_db.products_{i};")
-    cursor.execute("DROP VIEW IF EXISTS food_db.products;")
-    cursor.execute("DROP TABLE IF EXISTS food_db.products;")
-    
-    cursor.execute("""
-    CREATE TABLE IF NOT EXISTS food_db.products (
-        code VARCHAR(50) PRIMARY KEY,
-        product_name TEXT NULL,
-        generic_name TEXT NULL,
-        brands TEXT NULL,
-        allergens TEXT NULL,
-        ingredients_text TEXT NULL,
-        proteins_100g DOUBLE NULL,
-        fat_100g DOUBLE NULL,
-        carbohydrates_100g DOUBLE NULL,
-        sugars_100g DOUBLE NULL,
-        sodium_100g DOUBLE NULL,
-        salt_100g DOUBLE NULL,
-        `energy-kcal_100g` DOUBLE NULL,
-        `vitamin-a_100g` DOUBLE NULL,
-        `vitamin-d_100g` DOUBLE NULL,
-        `vitamin-e_100g` DOUBLE NULL,
-        `vitamin-k_100g` DOUBLE NULL,
-        `vitamin-c_100g` DOUBLE NULL,
-        `vitamin-b1_100g` DOUBLE NULL,
-        `vitamin-b2_100g` DOUBLE NULL,
-        `vitamin-pp_100g` DOUBLE NULL,
-        `vitamin-b6_100g` DOUBLE NULL,
-        `vitamin-b9_100g` DOUBLE NULL,
-        `vitamin-b12_100g` DOUBLE NULL,
-        calcium_100g DOUBLE NULL,
-        iron_100g DOUBLE NULL,
-        magnesium_100g DOUBLE NULL,
-        zinc_100g DOUBLE NULL,
-        potassium_100g DOUBLE NULL,
-        cholesterol_100g DOUBLE NULL,
-        fiber_100g DOUBLE NULL,
-        FULLTEXT idx_search (product_name, ingredients_text)
-    ) ENGINE=InnoDB;
-    """)
+    # The products table is now dynamically generated by ingest_csv.py to support all ~200 columns.
     
     
     # Table Context Grants (PoLP)
     # Table Context Grants (PoLP)
     # The authenticated app process can handle credentials and now read/write custom plates!
     # The authenticated app process can handle credentials and now read/write custom plates!
@@ -175,7 +134,7 @@ def run_db_setup():
     cursor.execute("GRANT SELECT, INSERT, UPDATE, DELETE ON food_db.plates TO 'db_app_auth'@'%';")
     cursor.execute("GRANT SELECT, INSERT, UPDATE, DELETE ON food_db.plates TO 'db_app_auth'@'%';")
     cursor.execute("GRANT SELECT, INSERT, UPDATE, DELETE ON food_db.plate_items TO 'db_app_auth'@'%';")
     cursor.execute("GRANT SELECT, INSERT, UPDATE, DELETE ON food_db.plate_items TO 'db_app_auth'@'%';")
     
     
-    # Give the app read privileges on the whole database (including the products view when created)
+    # Give the app read privileges on the whole database
     cursor.execute("GRANT SELECT ON food_db.* TO 'db_app_auth'@'%';")
     cursor.execute("GRANT SELECT ON food_db.* TO 'db_app_auth'@'%';")
     
     
     cursor.execute("GRANT SELECT ON food_db.* TO 'db_reader'@'%';")
     cursor.execute("GRANT SELECT ON food_db.* TO 'db_reader'@'%';")

+ 161 - 0
unit_converter.py

@@ -0,0 +1,161 @@
+import re
+
+class UnitConverter:
+    """
+    Utility class to convert culinary quantities to metric weight (grams).
+
+    Weight units are scaled directly through ``WEIGHT_UNITS_G``. Volumetric
+    units are converted to milliliters through ``VOLUME_UNITS_ML`` and then
+    multiplied by a per-product density taken from ``PRODUCT_DENSITIES``.
+    Every public method is a classmethod, so no instance is required.
+    """
+
+    # Common culinary volumetric units and their approximate volume in milliliters (ml)
+    VOLUME_UNITS_ML = {
+        'tsp': 5.0,
+        'teaspoon': 5.0,
+        'tbsp': 15.0,
+        'tablespoon': 15.0,
+        'cup': 240.0,
+        'fl oz': 30.0,
+        'fluid ounce': 30.0,
+        'pint': 473.0,
+        'quart': 946.0,
+        'gallon': 3785.0,
+        'cm3': 1.0,
+        'ml': 1.0,
+        'milliliter': 1.0,
+        'millilitre': 1.0,
+        'cl': 10.0,
+        'dl': 100.0,
+        'l': 1000.0,
+        'liter': 1000.0,
+        'litre': 1000.0,
+        'pinch': 0.36, # rough estimate
+        'dash': 0.72,
+    }
+
+    # Densities in grams per milliliter (g/ml)
+    PRODUCT_DENSITIES = {
+        # Baking and flours
+        'flour': 0.53,
+        'all-purpose flour': 0.53,
+        'wheat flour': 0.53,
+        'sugar': 0.85,
+        'white sugar': 0.85,
+        'granulated sugar': 0.85,
+        'powdered sugar': 0.50,
+        'icing sugar': 0.50,
+        'brown sugar': 0.83,
+        'salt': 1.20,
+        'table salt': 1.20,
+        'baking powder': 0.90,
+        'baking soda': 1.10,
+        'cocoa powder': 0.42,
+
+        # Liquids
+        'water': 1.0,
+        'milk': 1.03,
+        'heavy cream': 0.99,
+        'vegetable oil': 0.92,
+        'olive oil': 0.92,
+        'honey': 1.42,
+        'maple syrup': 1.32,
+        'butter': 0.96, # melted or solid approx
+        'melted butter': 0.94,
+
+        # Grains and dry goods
+        'rice': 0.85,
+        'white rice': 0.85,
+        'oats': 0.38,
+        'rolled oats': 0.38,
+        'quinoa': 0.72,
+        'couscous': 0.72,
+        'lentils': 0.85,
+
+        # Condiments
+        'ketchup': 1.15,
+        'mustard': 1.05,
+        'mayonnaise': 0.95,
+        'peanut butter': 1.08,
+
+        # Default density for unknown items (approximate density of water/mixed food)
+        'default': 1.0
+    }
+
+    # Direct weight conversions (already in weight, just need unit conversion)
+    WEIGHT_UNITS_G = {
+        'g': 1.0,
+        'gram': 1.0,
+        'kg': 1000.0,
+        'kilo': 1000.0,
+        'kilogram': 1000.0,
+        'oz': 28.35,
+        'ounce': 28.35,
+        'lb': 453.59,
+        'pound': 453.59,
+        'mg': 0.001
+    }
+
+    # Quantity grammar for parse_and_convert: either a fraction with an
+    # optional leading whole number ("1/2", "1 1/2") or a plain decimal
+    # ("2", "1.5", ".5"), followed by a unit that starts with a letter and
+    # may contain digits or periods ("cm3", "tbsp.").
+    _QUANTITY_RE = re.compile(
+        r'^(?:(?:(?P<whole>\d+)\s+)?(?P<num>\d+)\s*/\s*(?P<den>\d+)'
+        r'|(?P<decimal>\d+\.?\d*|\.\d+))'
+        r'\s*(?P<unit>[a-zA-Z][a-zA-Z0-9.\s]*)$'
+    )
+
+    @staticmethod
+    def _normalize_unit(unit):
+        """Lower-case a unit, drop trailing periods and collapse whitespace."""
+        cleaned = str(unit).lower().strip().rstrip('.')
+        return ' '.join(cleaned.split())
+
+    @staticmethod
+    def _lookup_factor(table, unit):
+        """Return the factor for ``unit`` in ``table``, or None if unknown.
+
+        Tries the unit as given, then with a trailing "s" removed ("cups"),
+        then with a trailing "es" removed ("pinches").
+        """
+        candidates = [unit]
+        if unit.endswith('s'):
+            candidates.append(unit[:-1])
+        if unit.endswith('es'):
+            candidates.append(unit[:-2])
+        for candidate in candidates:
+            if candidate in table:
+                return table[candidate]
+        return None
+
+    @classmethod
+    def get_density(cls, product_name):
+        """
+        Return the density (g/ml) to use for ``product_name``.
+
+        An exact, case-insensitive key match wins. Otherwise the longest key
+        contained in the name is used, so that "salted butter" resolves to
+        butter rather than salt and "crunchy peanut butter" to peanut butter
+        rather than butter. Falls back to the 'default' entry when the name
+        is empty or nothing matches.
+        """
+        fallback = cls.PRODUCT_DENSITIES['default']
+        if not product_name:
+            return fallback
+
+        name = str(product_name).lower().strip()
+
+        # Exact match
+        if name in cls.PRODUCT_DENSITIES:
+            return cls.PRODUCT_DENSITIES[name]
+
+        # Partial match: the most specific (longest) contained key wins.
+        # The 'default' sentinel is not a product and is never matched here.
+        contained = [
+            key for key in cls.PRODUCT_DENSITIES
+            if key != 'default' and key in name
+        ]
+        if contained:
+            return cls.PRODUCT_DENSITIES[max(contained, key=len)]
+
+        return fallback
+
+    @classmethod
+    def convert_to_grams(cls, amount, unit, product_name=None):
+        """
+        Converts an amount and unit of a specific product to grams.
+
+        Args:
+            amount: Numeric quantity expressed in ``unit``.
+            unit: Unit name; case, surrounding whitespace, a trailing period
+                and a plural "s"/"es" suffix are tolerated.
+            product_name: Optional product name used to pick a density for
+                volumetric units.
+
+        Returns:
+            The weight in grams, or None when the unit is not recognized.
+        """
+        unit = cls._normalize_unit(unit)
+
+        # If it's already a weight unit, simple scalar conversion
+        weight_factor = cls._lookup_factor(cls.WEIGHT_UNITS_G, unit)
+        if weight_factor is not None:
+            return amount * weight_factor
+
+        # If it's a volumetric unit, use density
+        volume_factor = cls._lookup_factor(cls.VOLUME_UNITS_ML, unit)
+        if volume_factor is not None:
+            return amount * volume_factor * cls.get_density(product_name)
+
+        # Unrecognized unit
+        return None
+
+    @classmethod
+    def parse_and_convert(cls, recipe_string, product_name=None):
+        """
+        Parses a string like "1.5 cups", "2 tbsp", "1/2 cup" or "1 1/2 cups"
+        and converts it to grams.
+
+        Returns:
+            The weight in grams rounded to two decimals, or None when the
+            string cannot be parsed, the fraction has a zero denominator, or
+            the unit is not recognized.
+        """
+        match = cls._QUANTITY_RE.match(str(recipe_string).strip())
+        if match is None:
+            return None
+
+        if match.group('decimal') is not None:
+            amount = float(match.group('decimal'))
+        else:
+            denominator = int(match.group('den'))
+            if denominator == 0:
+                # "1/0 cup" is not a quantity; treat it as unparseable.
+                return None
+            whole = int(match.group('whole') or 0)
+            amount = whole + int(match.group('num')) / denominator
+
+        result = cls.convert_to_grams(amount, match.group('unit'), product_name)
+        if result is None:
+            return None
+        return round(result, 2)
+
+if __name__ == '__main__':
+    # Manual smoke run: print the gram conversion for a few sample
+    # (caption, quantity string, product name) combinations.
+    demo_cases = (
+        ("1 cup of all-purpose flour:", "1 cup", "all-purpose flour"),
+        ("1 cup of white sugar:", "1 cup", "white sugar"),
+        ("1 cup of water:", "1 cup", "water"),
+        ("2 tbsp of olive oil:", "2 tbsp", "olive oil"),
+        ("1 pound of generic food:", "1 pound", "unknown"),
+        ("1 pinch of salt:", "1 pinch", "salt"),
+    )
+    for caption, quantity, product in demo_cases:
+        print(caption, UnitConverter.parse_and_convert(quantity, product), "g")