Explorar el Código

Implement full dynamic CSV schema ingestion and unit conversion module

lanfr144 hace 2 semanas
padre
commit
620543f87d
Se han modificado 3 ficheros con 222 adiciones y 67 borrados
  1. 59 24
      ingest_csv.py
  2. 2 43
      setup_db.py
  3. 161 0
      unit_converter.py

+ 59 - 24
ingest_csv.py

@@ -2,6 +2,7 @@ import pandas as pd
 import myloginpath
 import urllib.parse
 from sqlalchemy import create_engine, text
+from sqlalchemy.types import VARCHAR, TEXT, DOUBLE
 import os
 import sys
 
@@ -25,25 +26,15 @@ def ingest_file(filename, engine):
         print(f"File {filename} not found locally.")
         return False
         
-    print(f"\n🚀 Found {filename}! Starting extreme batch ingestion into unified table...")
+    print(f"\n🚀 Found {filename}! Starting extreme batch ingestion for ALL columns...")
     
     chunk_size = 10000 
     total_processed = 0
-
-    required_columns = [
-        'code', 'product_name', 'generic_name', 'brands', 'allergens', 'ingredients_text',
-        'proteins_100g', 'fat_100g', 'carbohydrates_100g', 'sugars_100g', 'sodium_100g', 'salt_100g',
-        'energy-kcal_100g', 'vitamin-a_100g', 'vitamin-d_100g', 'vitamin-e_100g', 'vitamin-k_100g',
-        'vitamin-c_100g', 'vitamin-b1_100g', 'vitamin-b2_100g', 'vitamin-pp_100g', 'vitamin-b6_100g',
-        'vitamin-b9_100g', 'vitamin-b12_100g', 'calcium_100g', 'iron_100g', 'magnesium_100g',
-        'zinc_100g', 'potassium_100g', 'cholesterol_100g', 'fiber_100g'
-    ]
+    is_first_chunk = True
 
     for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip', low_memory=False, encoding='utf-8'):
         try:
-            # Filter to only the columns that actually exist in this chunk and are in required_columns
-            available_cols = [c for c in required_columns if c in chunk.columns]
-            df = chunk[available_cols].copy()
+            df = chunk.copy()
             
             if 'code' not in df.columns:
                 continue
@@ -52,23 +43,32 @@ def ingest_file(filename, engine):
             df.dropna(subset=['code'], inplace=True)
             df.drop_duplicates(subset=['code'], inplace=True)
             
-            # Ensure all required columns exist in the dataframe (fill missing with None)
-            for col in required_columns:
-                if col not in df.columns:
-                    df[col] = None
-                    
-            # Reorder columns to exactly match the target table schema
-            df = df[required_columns]
+            # Map datatypes dynamically to avoid InnoDB row size limits
+            # Code is VARCHAR(50), everything else is TEXT (strings) or DOUBLE (if we were casting, but we read as str)
+            # Since we read dtype=str, pandas will default all to TEXT which is perfect for Off-Page storage.
+            sql_dtypes = {col: TEXT() for col in df.columns}
+            sql_dtypes['code'] = VARCHAR(50)
             
+            if is_first_chunk:
+                # 1. Initialize the target table with the exact schema from the first chunk
+                df.head(0).to_sql('products', con=engine, if_exists='replace', index=False, dtype=sql_dtypes)
+                
+                # 2. Add Primary Key immediately
+                with engine.begin() as conn:
+                    conn.execute(text("ALTER TABLE products ADD PRIMARY KEY (code);"))
+                is_first_chunk = False
+
             # Write chunk to a temporary table
-            df.to_sql('temp_products', con=engine, if_exists='replace', index=False)
+            df.to_sql('temp_products', con=engine, if_exists='replace', index=False, dtype=sql_dtypes)
             
             # Use INSERT IGNORE to append to the main table, skipping any global duplicate codes
             with engine.begin() as connection:
-                connection.execute(text("INSERT IGNORE INTO products SELECT * FROM temp_products"))
+                # Ensure columns match by explicitly listing them
+                cols = ", ".join([f"`{c}`" for c in df.columns])
+                connection.execute(text(f"INSERT IGNORE INTO products ({cols}) SELECT {cols} FROM temp_products"))
             
             total_processed += len(df)
-            print(f"   Successfully appended {total_processed} rows into unified schema...", end="\r")
+            print(f"   Successfully appended {total_processed} rows into unified dynamic schema...", end="\r")
         except BaseException as e:
             print(f"\n   [Warning] Chunk skipped due to error: {e}")
             
@@ -79,8 +79,42 @@ def ingest_file(filename, engine):
     print(f"\n✅ Finished importing {filename}.")
     return True
 
def create_indexes(engine):
    """
    Add the search and lookup indexes to the `products` table.

    Four ALTER TABLE statements are attempted in a fixed order inside a single
    `engine.begin()` block: two FULLTEXT indexes (product_name +
    ingredients_text, and allergens) followed by two regular indexes on
    brands(50) and generic_name(50).

    Each statement is tried on its own: when one raises, the error is printed
    as a "Skipped" line and the remaining statements are still attempted.
    Nothing is returned and no exception escapes this function; a failure of
    the surrounding block itself is reported with a final error line.

    engine: object whose `begin()` returns a context manager yielding a
            connection with an `execute()` method.
    """
    print("\n🛠️ Creating performance indexes (FULLTEXT and Standard)...")

    # One row per index: (DDL statement, message on success, label for the skip message)
    index_jobs = (
        # Fulltext search on the main textual fields
        (
            "ALTER TABLE products ADD FULLTEXT idx_search (product_name, ingredients_text);",
            "  - Added FULLTEXT index on product_name, ingredients_text",
            "FULLTEXT idx_search",
        ),
        (
            "ALTER TABLE products ADD FULLTEXT idx_allergens (allergens);",
            "  - Added FULLTEXT index on allergens",
            "FULLTEXT idx_allergens",
        ),
        # Regular indexes, declared on the first 50 characters of each column
        (
            "ALTER TABLE products ADD INDEX idx_brands (brands(50));",
            "  - Added INDEX on brands",
            "INDEX idx_brands",
        ),
        (
            "ALTER TABLE products ADD INDEX idx_generic (generic_name(50));",
            "  - Added INDEX on generic_name",
            "INDEX idx_generic",
        ),
    )

    try:
        with engine.begin() as connection:
            for ddl, added_message, skip_label in index_jobs:
                # A failing statement (e.g. a column absent from this import)
                # is reported and skipped so the other indexes still get a chance.
                try:
                    connection.execute(text(ddl))
                    print(added_message)
                except Exception as e:
                    print(f"  - Skipped {skip_label}: {e}")

        print("✅ Indexing Complete!")
    except Exception as e:
        print(f"❌ Indexing encountered an issue: {e}")
+
 if __name__ == "__main__":
-    print("Initiating OpenFoodFacts CSV Unified Ingestion Process...")
+    print("Initiating OpenFoodFacts CSV Unified Dynamic Ingestion Process...")
     engine = get_loader_engine()
     
     processed_en = ingest_file('en.openfoodfacts.org.products.csv', engine)
@@ -90,4 +124,5 @@ if __name__ == "__main__":
         print("\n❌ Could not find either 'en.openfoodfacts.org.products.csv' or 'fr.openfoodfacts.org.products.csv'.")
         print("Please download them directly into the root folder and run this script again.")
     else:
+        create_indexes(engine)
         print("\n🎉 Full database reload complete! Ready for AI RAG.")

+ 2 - 43
setup_db.py

@@ -125,48 +125,7 @@ def run_db_setup():
     ) ENGINE=InnoDB;
     """)
 
-    # 4. Products Table (Unified)
-    for i in range(1, 101): # Drop up to 100 partitions just in case
-        cursor.execute(f"DROP TABLE IF EXISTS food_db.products_{i};")
-    cursor.execute("DROP VIEW IF EXISTS food_db.products;")
-    cursor.execute("DROP TABLE IF EXISTS food_db.products;")
-    
-    cursor.execute("""
-    CREATE TABLE IF NOT EXISTS food_db.products (
-        code VARCHAR(50) PRIMARY KEY,
-        product_name TEXT NULL,
-        generic_name TEXT NULL,
-        brands TEXT NULL,
-        allergens TEXT NULL,
-        ingredients_text TEXT NULL,
-        proteins_100g DOUBLE NULL,
-        fat_100g DOUBLE NULL,
-        carbohydrates_100g DOUBLE NULL,
-        sugars_100g DOUBLE NULL,
-        sodium_100g DOUBLE NULL,
-        salt_100g DOUBLE NULL,
-        `energy-kcal_100g` DOUBLE NULL,
-        `vitamin-a_100g` DOUBLE NULL,
-        `vitamin-d_100g` DOUBLE NULL,
-        `vitamin-e_100g` DOUBLE NULL,
-        `vitamin-k_100g` DOUBLE NULL,
-        `vitamin-c_100g` DOUBLE NULL,
-        `vitamin-b1_100g` DOUBLE NULL,
-        `vitamin-b2_100g` DOUBLE NULL,
-        `vitamin-pp_100g` DOUBLE NULL,
-        `vitamin-b6_100g` DOUBLE NULL,
-        `vitamin-b9_100g` DOUBLE NULL,
-        `vitamin-b12_100g` DOUBLE NULL,
-        calcium_100g DOUBLE NULL,
-        iron_100g DOUBLE NULL,
-        magnesium_100g DOUBLE NULL,
-        zinc_100g DOUBLE NULL,
-        potassium_100g DOUBLE NULL,
-        cholesterol_100g DOUBLE NULL,
-        fiber_100g DOUBLE NULL,
-        FULLTEXT idx_search (product_name, ingredients_text)
-    ) ENGINE=InnoDB;
-    """)
+    # The products table is now dynamically generated by ingest_csv.py to support all ~200 columns.
     
     # Table Context Grants (PoLP)
     # The authenticated app process can handle credentials and now read/write custom plates!
@@ -175,7 +134,7 @@ def run_db_setup():
     cursor.execute("GRANT SELECT, INSERT, UPDATE, DELETE ON food_db.plates TO 'db_app_auth'@'%';")
     cursor.execute("GRANT SELECT, INSERT, UPDATE, DELETE ON food_db.plate_items TO 'db_app_auth'@'%';")
     
-    # Give the app read privileges on the whole database (including the products view when created)
+    # Give the app read privileges on the whole database
     cursor.execute("GRANT SELECT ON food_db.* TO 'db_app_auth'@'%';")
     
     cursor.execute("GRANT SELECT ON food_db.* TO 'db_reader'@'%';")

+ 161 - 0
unit_converter.py

@@ -0,0 +1,161 @@
+import re
+
class UnitConverter:
    """
    Utility class to convert culinary quantities to metric weight (grams).

    Weight units are converted with a fixed factor. Volumetric units are first
    converted to milliliters and then multiplied by the density (g/ml) looked
    up for the product name; unknown products fall back to the 'default'
    density of 1.0 g/ml.

    All methods are classmethods; the class holds only lookup tables.
    """

    # Common culinary volumetric units and their approximate volume in milliliters (ml)
    VOLUME_UNITS_ML = {
        'tsp': 5.0,
        'teaspoon': 5.0,
        'tbsp': 15.0,
        'tablespoon': 15.0,
        'cup': 240.0,
        'fl oz': 30.0,
        'fluid ounce': 30.0,
        'pint': 473.0,
        'quart': 946.0,
        'gallon': 3785.0,
        'cm3': 1.0,
        # 'ml' is the base unit of this table, so it must be accepted as input too
        'ml': 1.0,
        'milliliter': 1.0,
        'millilitre': 1.0,
        'cl': 10.0,
        'dl': 100.0,
        'l': 1000.0,
        'liter': 1000.0,
        'litre': 1000.0,
        'pinch': 0.36,  # rough estimate
        'dash': 0.72,
    }

    # Densities in grams per milliliter (g/ml)
    PRODUCT_DENSITIES = {
        # Baking and flours
        'flour': 0.53,
        'all-purpose flour': 0.53,
        'wheat flour': 0.53,
        'sugar': 0.85,
        'white sugar': 0.85,
        'granulated sugar': 0.85,
        'powdered sugar': 0.50,
        'icing sugar': 0.50,
        'brown sugar': 0.83,
        'salt': 1.20,
        'table salt': 1.20,
        'baking powder': 0.90,
        'baking soda': 1.10,
        'cocoa powder': 0.42,

        # Liquids
        'water': 1.0,
        'milk': 1.03,
        'heavy cream': 0.99,
        'vegetable oil': 0.92,
        'olive oil': 0.92,
        'honey': 1.42,
        'maple syrup': 1.32,
        'butter': 0.96,  # melted or solid approx
        'melted butter': 0.94,

        # Grains and dry goods
        'rice': 0.85,
        'white rice': 0.85,
        'oats': 0.38,
        'rolled oats': 0.38,
        'quinoa': 0.72,
        'couscous': 0.72,
        'lentils': 0.85,

        # Condiments
        'ketchup': 1.15,
        'mustard': 1.05,
        'mayonnaise': 0.95,
        'peanut butter': 1.08,

        # Default density for unknown items (approximate density of water/mixed food)
        'default': 1.0
    }

    # Direct weight conversions (already in weight, just need unit conversion)
    WEIGHT_UNITS_G = {
        'g': 1.0,
        'gram': 1.0,
        'kg': 1000.0,
        'kilo': 1000.0,
        'kilogram': 1000.0,
        'oz': 28.35,
        'ounce': 28.35,
        'lb': 453.59,
        'pound': 453.59,
        'mg': 0.001
    }

    # "<amount> <unit>": the amount is a mixed number ("1 1/2"), a simple
    # fraction ("1/2") or a decimal ("1.5"); the unit starts with a letter and
    # may contain digits and spaces so that "cm3" and "fl oz" are accepted.
    _QUANTITY_RE = re.compile(
        r'^(\d+\s+\d+/\d+|\d+/\d+|[\d.]+)\s*([a-zA-Z][a-zA-Z\d\s]*)$'
    )

    @classmethod
    def get_density(cls, product_name):
        """
        Return the density in g/ml to use for `product_name`.

        Lookup order:
          1. falsy name (None, "")            -> the 'default' density
          2. exact match on the lowercased, stripped name
          3. the LONGEST table key contained in the name, so that
             "creamy peanut butter" resolves to 'peanut butter' rather than
             to the shorter, more generic 'butter'
          4. otherwise                         -> the 'default' density
        """
        if not product_name:
            return cls.PRODUCT_DENSITIES['default']

        product_name = str(product_name).lower().strip()

        # Exact match
        if product_name in cls.PRODUCT_DENSITIES:
            return cls.PRODUCT_DENSITIES[product_name]

        # Partial match: 'default' is a sentinel, not a product, so it never
        # takes part in substring matching.
        candidates = [
            key for key in cls.PRODUCT_DENSITIES
            if key != 'default' and key in product_name
        ]
        if candidates:
            # Most specific key wins; ties keep the table's declaration order.
            return cls.PRODUCT_DENSITIES[max(candidates, key=len)]

        return cls.PRODUCT_DENSITIES['default']

    @classmethod
    def _unit_candidates(cls, unit):
        """Return the normalized unit followed by its plausible singular forms."""
        # Lowercase, collapse inner whitespace ("fl  oz") and drop an
        # abbreviation dot ("tbsp.").
        normalized = " ".join(str(unit).lower().split()).rstrip('.')
        candidates = [normalized]
        if normalized.endswith('s'):
            candidates.append(normalized[:-1])       # cups -> cup
        if normalized.endswith(('ches', 'shes')):
            candidates.append(normalized[:-2])       # pinches -> pinch, dashes -> dash
        return candidates

    @classmethod
    def convert_to_grams(cls, amount, unit, product_name=None):
        """
        Converts an amount and unit of a specific product to grams.

        amount:       numeric quantity expressed in `unit`.
        unit:         weight or volume unit name, singular or plural, any case.
        product_name: optional product used to pick a density; only consulted
                      for volumetric units.

        Returns the weight in grams, or None when the unit is not recognized.
        Weight units take precedence over volume units.
        """
        candidates = cls._unit_candidates(unit)

        # If it's already a weight unit, simple scalar conversion
        for candidate in candidates:
            if candidate in cls.WEIGHT_UNITS_G:
                return amount * cls.WEIGHT_UNITS_G[candidate]

        # If it's a volumetric unit, go through milliliters and the product density
        for candidate in candidates:
            if candidate in cls.VOLUME_UNITS_ML:
                volume_ml = amount * cls.VOLUME_UNITS_ML[candidate]
                return volume_ml * cls.get_density(product_name)

        # Unrecognized unit
        return None

    @classmethod
    def _parse_amount(cls, amount_text):
        """
        Turn "1.5", "1/2" or "1 1/2" into a float.

        Returns None for malformed numbers ("1.2.3") and zero denominators.
        """
        parts = amount_text.split()
        if '/' in parts[-1]:
            numerator, denominator = parts[-1].split('/')
            if int(denominator) == 0:
                return None
            value = int(numerator) / int(denominator)
            if len(parts) == 2:
                # Mixed number: whole part plus fraction
                value += int(parts[0])
            return value
        try:
            return float(amount_text)
        except ValueError:
            return None

    @classmethod
    def parse_and_convert(cls, recipe_string, product_name=None):
        """
        Parses a string like "1.5 cups", "1/2 cup", "1 1/2 cups" or "2 tbsp"
        and converts it to grams.

        recipe_string: "<amount> <unit>" text; surrounding whitespace is ignored.
        product_name:  optional product used to pick a density for volume units.

        Returns the weight in grams rounded to 2 decimals, or None when the
        string cannot be parsed or the unit is not recognized.
        """
        match = cls._QUANTITY_RE.match(str(recipe_string).strip())
        if not match:
            return None

        amount = cls._parse_amount(match.group(1))
        if amount is None:
            return None

        result = cls.convert_to_grams(amount, match.group(2).strip(), product_name)
        if result is None:
            return None
        return round(result, 2)
+
if __name__ == '__main__':
    # Manual smoke run: print a handful of sample conversions.
    # Each row is (label printed first, quantity string, product name).
    demo_cases = (
        ("1 cup of all-purpose flour:", "1 cup", "all-purpose flour"),
        ("1 cup of white sugar:", "1 cup", "white sugar"),
        ("1 cup of water:", "1 cup", "water"),
        ("2 tbsp of olive oil:", "2 tbsp", "olive oil"),
        ("1 pound of generic food:", "1 pound", "unknown"),
        ("1 pinch of salt:", "1 pinch", "salt"),
    )
    for label, quantity, product in demo_cases:
        print(label, UnitConverter.parse_and_convert(quantity, product), "g")