lanfr144 4 недели назад
Родитель
Commit
9c6abcff81
3 изменённых файла с 158 добавлено и 0 удалено
  1. 76 0
      convert_datatypes.py
  2. 76 0
      ingest_csv.py
  3. 6 0
      requirements.txt

+ 76 - 0
convert_datatypes.py

@@ -0,0 +1,76 @@
+import pymysql
+import pandas as pd
+import getpass
+
def detect_and_convert_types():
    """Interactively tighten column types on the `products` table of `food_db`.

    For each candidate column, samples up to 5000 non-null, non-empty values:
      * if every sampled value looks like an integer, the column is altered
        to BIGINT;
      * else, if at least one value has a decimal part and all values are
        numeric (comma or dot decimal separator), commas are normalized to
        dots and the column is altered to DOUBLE;
      * otherwise the column is left as TEXT.

    Authenticates as the `db_owner` MySQL account (password prompted at
    runtime). Prints progress/outcome per column; returns None. The cursor
    and connection are always released, even if a column inspection or
    ALTER fails part-way through.

    NOTE(review): only a 5000-row sample is inspected before ALTERing the
    whole table — rows beyond the sample that are non-numeric will make the
    ALTER fail (reported, not raised).
    """
    print("Welcome to the Data Types Optimizer.")
    print("WARNING: This modifies your database schemas. You must authenticate as the database `db_owner`.\n")

    owner_pass = getpass.getpass("Enter the MySQL 'db_owner' password: ")

    try:
        conn = pymysql.connect(
            host='127.0.0.1',
            user='db_owner',
            password=owner_pass,
            database='food_db'
        )
        cursor = conn.cursor()
    except Exception as e:
        print(f"❌ Connection failed: {e}")
        return

    # Known numeric candidates. Column names are hard-coded (never
    # user-supplied), so interpolating them into the SQL below is safe.
    columns_to_inspect = ["quantity", "created_t", "last_modified_t"]

    try:
        for col in columns_to_inspect:
            print(f"\nAnalyzing column: `{col}`")

            try:
                # Sample up to 5000 non-null, non-empty values; this also
                # doubles as an existence check for the column.
                query = f"SELECT `{col}` FROM products WHERE `{col}` IS NOT NULL AND `{col}` != '' LIMIT 5000"
                df = pd.read_sql(query, conn)
            except Exception as e:
                print(f" ⚠️ Could not read column `{col}`: {e}")
                continue

            if df.empty:
                print(f" ⏭️ Column `{col}` is entirely null/empty. Keeping as TEXT.")
                continue

            series = df[col].astype(str).str.strip()

            # INTEGER CHECK: every sampled value is an optionally-signed integer.
            if series.str.match(r'^-?\d+$').all():
                print(f" ⚙️ Status: ALL INTS matched. Converting `{col}` to BIGINT.")
                _alter_column(conn, cursor, col, "BIGINT", normalize_decimal=False)
                continue

            # FLOAT CHECK: at least one value has a fractional part, and all
            # values are numeric once ',' is treated as the decimal separator.
            test_float = series.str.replace(',', '.')
            if test_float.str.match(r'^-?\d*\.\d+$').any() and test_float.str.match(r'^-?\d*\.?\d+$').all():
                print(f" ⚙️ Status: FLOATS detected. Standardizing and converting `{col}` to DOUBLE...")
                _alter_column(conn, cursor, col, "DOUBLE", normalize_decimal=True)
                continue

            print(f" ⏭️ Keeping `{col}` as TEXT.")
    finally:
        # Always release DB resources, even if an unexpected error escapes the loop.
        cursor.close()
        conn.close()

    print("\n🎉 Datatype conversion complete!")


def _alter_column(conn, cursor, col, sql_type, *, normalize_decimal):
    """NULL-out empty strings, optionally rewrite ',' decimals to '.', then ALTER `col` to `sql_type`.

    Commits on success and prints a success/failure line; ALTER errors are
    reported, not raised, so the caller can move on to the next column.
    """
    try:
        cursor.execute(f"UPDATE products SET `{col}` = NULL WHERE `{col}` = '';")
        if normalize_decimal:
            cursor.execute(f"UPDATE products SET `{col}` = REPLACE(`{col}`, ',', '.') WHERE `{col}` LIKE '%,%';")
        cursor.execute(f"ALTER TABLE products MODIFY COLUMN `{col}` {sql_type};")
        conn.commit()
        print(" ✅ Success")
    except Exception as e:
        print(f" ❌ Failed to alter table: {e}")
+
if __name__ == '__main__':
    # Entry point: run the interactive optimizer only when executed as a script.
    detect_and_convert_types()

+ 76 - 0
ingest_csv.py

@@ -0,0 +1,76 @@
+import pandas as pd
+import pymysql
+import myloginpath
+import os
+import sys
+
def get_loader_connection():
    """Open a pymysql connection to `food_db` via the `app_loader` login path.

    Host, user, and password are read from the MySQL login-path file
    (populated with `mysql_config_editor`); LOCAL INFILE is enabled for
    bulk loading. On any failure the process prints a setup hint and
    terminates with exit status 1.
    """
    try:
        conf = myloginpath.parse('app_loader')
        connection = pymysql.connect(
            host=conf.get('host', '127.0.0.1'),
            user=conf.get('user'),
            password=conf.get('password'),
            database='food_db',
            local_infile=True
        )
    except Exception as e:
        print(f"❌ Failed to connect to MySQL via app_loader: {e}")
        print("Did you run: mysql_config_editor set --login-path=app_loader --host=127.0.0.1 --user=db_loader --password")
        sys.exit(1)
    return connection
+
def ingest_file(filename, conn):
    """Stream a tab-separated OpenFoodFacts export into the `products` table.

    Parameters:
        filename: path to the TSV dump to ingest.
        conn: open pymysql connection (caller retains ownership; this
              function commits after every chunk but never closes it).

    Returns:
        False immediately if `filename` does not exist; True after the
        whole file has been inserted.

    Rows are inserted in 50k-row chunks with INSERT IGNORE so duplicate
    primary keys (barcodes) are skipped instead of aborting the load.
    """
    if not os.path.exists(filename):
        return False

    # BUG FIX: the f-strings below printed the literal "(unknown)" instead
    # of interpolating the filename.
    print(f"\n🚀 Found {filename}! Starting ingestion pipeline...")

    # Core OpenFoodFacts columns our `products` table expects; any extra
    # CSV columns are dropped, any missing ones are simply not inserted.
    expected_columns = [
        "code", "url", "creator", "created_t", "created_datetime", "last_modified_t",
        "last_modified_datetime", "product_name", "generic_name", "quantity", "packaging",
        "brands", "categories", "origins", "labels", "stores", "countries", "ingredients_text",
        "allergens", "traces"
    ]

    chunk_size = 50000
    total_processed = 0

    # Chunked read keeps memory bounded while streaming into MySQL.
    for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip'):
        # Keep only the columns we mapped above.
        available_cols = [col for col in expected_columns if col in chunk.columns]
        df = chunk[available_cols]

        # Replace NaN with None so MySQL stores NULL.
        df = df.where(pd.notnull(df), None)

        placeholders = ', '.join(['%s'] * len(available_cols))
        columns_str = ', '.join([f"`{col}`" for col in available_cols])

        # INSERT IGNORE prevents crashing on duplicate primary keys (barcodes).
        sql = f"INSERT IGNORE INTO products ({columns_str}) VALUES ({placeholders})"

        with conn.cursor() as cursor:
            cursor.executemany(sql, df.values.tolist())
        conn.commit()

        total_processed += len(df)
        print(f"   Inserted {total_processed} rows...")

    print(f"✅ Finished importing {filename}.")
    return True
+
if __name__ == "__main__":
    # Script entry point: attempt both language dumps; warn if neither exists.
    print("Initiating OpenFoodFacts CSV Ingestion Process...")
    conn = get_loader_connection()

    dump_files = ['en.openfoodfacts.org.products.csv', 'fr.openfoodfacts.org.products.csv']
    outcomes = [ingest_file(path, conn) for path in dump_files]

    if not any(outcomes):
        print("\n❌ Could not find either 'en.openfoodfacts.org.products.csv' or 'fr.openfoodfacts.org.products.csv'.")
        print("Please download them directly into the root folder and run this script again.")

    conn.close()

+ 6 - 0
requirements.txt

@@ -0,0 +1,6 @@
+pandas
+pymysql
+myloginpath
+streamlit
+ollama
+bcrypt