
TG-6: Fully refactored ingestion pipeline to SQLAlchemy with pandas.to_sql for enterprise data processing

lanfr144 4 weeks ago
parent commit fb66446f15
1 changed file with 35 additions and 38 deletions

+ 35 - 38
ingest_csv.py

@@ -1,31 +1,32 @@
 import pandas as pd
-import pymysql
 import myloginpath
+import urllib.parse
+from sqlalchemy import create_engine
 import os
 import sys
 
-def get_loader_connection():
+def get_loader_engine():
     try:
         conf = myloginpath.parse('app_loader')
-        return pymysql.connect(
-            host=conf.get('host', '127.0.0.1'),
-            user=conf.get('user'),
-            password=conf.get('password'),
-            database='food_db'
-        )
+        user = conf.get('user')
+        password = urllib.parse.quote_plus(conf.get('password'))
+        host = conf.get('host', '127.0.0.1')
+        database = 'food_db'
+        
+        # Build the SQLAlchemy connection URL for the PyMySQL driver
+        conn_str = f"mysql+pymysql://{user}:{password}@{host}/{database}?charset=utf8mb4"
+        return create_engine(conn_str)
     except Exception as e:
-        print(f"❌ Failed to connect to MySQL via app_loader: {e}")
-        print("Did you run: mysql_config_editor set --login-path=app_loader --host=127.0.0.1 --user=db_loader --password")
+        print(f"❌ Failed to parse myloginpath or create engine: {e}")
         sys.exit(1)
 
-def ingest_file(filename, conn):
+def ingest_file(filename, engine):
     if not os.path.exists(filename):
+        print(f"File {filename} not found locally.")
         return False
         
-    print(f"\n🚀 Found {filename}! Starting ingestion pipeline...")
+    print(f"\n🚀 Found {filename}! Starting ingestion via SQLAlchemy pipeline...")
     
-    # We read the first few rows to grab the columns our table actually expects. 
-    # (assuming products table matches OpenFoodFacts core schema)
     expected_columns = [
         "code", "url", "creator", "created_t", "created_datetime", "last_modified_t", 
         "last_modified_datetime", "product_name", "generic_name", "quantity", "packaging", 
@@ -33,44 +34,40 @@ def ingest_file(filename, conn):
         "allergens", "traces"
     ]
     
-    # Reduced chunk size to 1000 to prevent 'max_allowed_packet' and PyMySQL memory crash
-    chunk_size = 1000 
+    chunk_size = 5000  # rows per read_csv chunk
     total_processed = 0
 
-    # Using chunking to stream into MySQL efficiently
     for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip'):
-        # Filter only the columns we mapped
+        # Keep only the columns the products table defines
         available_cols = [col for col in expected_columns if col in chunk.columns]
         df = chunk[available_cols]
         
-        # Replace NaN with None so MySQL treats it as NULL
-        df = df.where(pd.notnull(df), None)
-        
-        placeholders = ', '.join(['%s'] * len(available_cols))
-        columns_str = ', '.join([f"`{col}`" for col in available_cols])
-        
-        # Use INSERT IGNORE to prevent crashing on duplicate primary keys (barcodes)
-        sql = f"INSERT IGNORE INTO products ({columns_str}) VALUES ({placeholders})"
-        
-        with conn.cursor() as cursor:
-            cursor.executemany(sql, df.values.tolist())
-        conn.commit()
-        
-        total_processed += len(df)
-        print(f"   Inserted {total_processed} rows...")
+        # to_sql converts NaN values to SQL NULLs internally
+        try:
+            # 'append' because the products table already exists with its primary key.
+            # drop_duplicates only dedupes within this chunk; a 'code' already in the
+            # database still raises an IntegrityError, which is caught below
+            df = df.drop_duplicates(subset=['code'])
+            df.to_sql('products', con=engine, if_exists='append', index=False)
+            total_processed += len(df)
+            print(f"   Appended {total_processed} rows so far...")
+        except Exception as e:
+            # A duplicate key aborts the whole chunk, not just the offending row;
+            # see the INSERT IGNORE sketch below for a row-level alternative
+            if "Duplicate entry" in str(e):
+                print("   [Warning] Chunk skipped: duplicate primary key already in database")
+            else:
+                print(f"   [Warning] Chunk skipped: {e}")
         
     print(f"✅ Finished importing {filename}.")
     return True
 
 if __name__ == "__main__":
     print("Initiating OpenFoodFacts CSV Ingestion Process...")
-    conn = get_loader_connection()
+    engine = get_loader_engine()
     
-    processed_en = ingest_file('en.openfoodfacts.org.products.csv', conn)
-    processed_fr = ingest_file('fr.openfoodfacts.org.products.csv', conn)
+    processed_en = ingest_file('en.openfoodfacts.org.products.csv', engine)
+    processed_fr = ingest_file('fr.openfoodfacts.org.products.csv', engine)
     
     if not processed_en and not processed_fr:
         print("\n❌ Could not find either 'en.openfoodfacts.org.products.csv' or 'fr.openfoodfacts.org.products.csv'.")
         print("Please download them directly into the root folder and run this script again.")
-        
-    conn.close()
+    engine.dispose()
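
Two sketches worth noting alongside this commit; both assume SQLAlchemy 1.4+ and reuse the names from the script ('app_loader', 'food_db', 'products').

The manual urllib.parse.quote_plus escaping can be dropped entirely: sqlalchemy.engine.URL.create builds the connection URL and escapes credentials itself. A minimal alternative form of get_loader_engine:

    from sqlalchemy import create_engine
    from sqlalchemy.engine import URL

    def get_loader_engine():
        conf = myloginpath.parse('app_loader')
        url = URL.create(
            "mysql+pymysql",
            username=conf.get('user'),
            password=conf.get('password'),  # URL.create escapes special characters itself
            host=conf.get('host', '127.0.0.1'),
            database='food_db',
            query={"charset": "utf8mb4"},
        )
        return create_engine(url)

The except-and-skip path in ingest_file still drops an entire 5000-row chunk whenever a single 'code' already exists in the database. pandas.to_sql accepts a callable via its method parameter, with the signature (pd_table, conn, keys, data_iter); the sketch below restores the old INSERT IGNORE behavior so duplicates are skipped row by row. The name insert_ignore is illustrative, not a pandas or SQLAlchemy API:

    from sqlalchemy.dialects.mysql import insert as mysql_insert

    def insert_ignore(pd_table, conn, keys, data_iter):
        # pandas passes its SQLTable wrapper, the column names, and an
        # iterator of row tuples for the current batch
        rows = [dict(zip(keys, row)) for row in data_iter]
        # INSERT IGNORE: MySQL silently drops rows that violate the primary key
        stmt = mysql_insert(pd_table.table).values(rows).prefix_with("IGNORE")
        result = conn.execute(stmt)
        return result.rowcount

    # inside the chunk loop, replacing the bare to_sql call:
    df.to_sql('products', con=engine, if_exists='append', index=False,
              method=insert_ignore)

With method=insert_ignore, rows whose 'code' already exists are skipped by MySQL while the rest of the chunk still lands, so the "Duplicate entry" branch becomes a genuine error path rather than expected control flow.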