4 veckor sedan · fb66446f15
--- a/ingest_csv.py
+++ b/ingest_csv.py
@@ -1,31 +1,32 @@
 
															 import pandas as pd
														
 
															-import pymysql
														
 
															 import myloginpath
														
 
															+import urllib.parse
														
 
															+from sqlalchemy import create_engine
														
 
															 import os
														
 
															 import sys
														
 
															-def get_loader_connection():
														
 
															+def get_loader_engine():
														
 
															     try:
														
 
															         conf = myloginpath.parse('app_loader')
														
 
															-        return pymysql.connect(
														
 
															-            host=conf.get('host', '127.0.0.1'),
														
 
															-            user=conf.get('user'),
														
 
															-            password=conf.get('password'),
														
 
															-            database='food_db'
														
 
															-        )
														
 
															+        user = conf.get('user')
														
 
															+        password = urllib.parse.quote_plus(conf.get('password'))
														
 
															+        host = conf.get('host', '127.0.0.1')
														
 
															+        database = 'food_db'
														
 
															+        
														
 
															+        # Build strict SQLAlchemy PyMySQL string
														
 
															+        conn_str = f"mysql+pymysql://{user}:{password}@{host}/{database}?charset=utf8mb4"
														
 
															+        return create_engine(conn_str)
														
 
															     except Exception as e:
														
 
															-        print(f"❌ Failed to connect to MySQL via app_loader: {e}")
														
 
															-        print("Did you run: mysql_config_editor set --login-path=app_loader --host=127.0.0.1 --user=db_loader --password")
														
 
															+        print(f"❌ Failed to parse myloginpath or create engine: {e}")
														
 
															         sys.exit(1)
														
 
															-def ingest_file(filename, conn):
														
 
															+def ingest_file(filename, engine):
														
 
															     if not os.path.exists(filename):
														
 
															+        print(f"File {filename} not found locally.")
														
 
															         return False
														
 
															-    print(f"\n🚀 Found {filename}! Starting ingestion pipeline...")
														
 
															+    print(f"\n🚀 Found {filename}! Starting ingestion via SQLAlchemy pipeline...")
														
 
															-    # We read the first few rows to grab the columns our table actually expects. 
														
 
															-    # (assuming products table matches OpenFoodFacts core schema)
														
 
															     expected_columns = [
														
 
															         "code", "url", "creator", "created_t", "created_datetime", "last_modified_t", 
														
 
															         "last_modified_datetime", "product_name", "generic_name", "quantity", "packaging", 
														
@@ -33,44 +34,40 @@ def ingest_file(filename, conn):
 
															         "allergens", "traces"
														
 
															     ]
														
 
															-    # Reduced chunk size to 1000 to prevent 'max_allowed_packet' and PyMySQL memory crash
														
 
															-    chunk_size = 1000 
														
 
															+    chunk_size = 5000 
														
 
															     total_processed = 0
														
 
															-    # Using chunking to stream into MySQL efficiently
														
 
															     for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip'):
														
 
															-        # Filter only the columns we mapped
														
 
															+        # Filter explicitly to schema
														
 
															         available_cols = [col for col in expected_columns if col in chunk.columns]
														
 
															         df = chunk[available_cols]
														
 
															-        # Replace NaN with None so MySQL treats it as NULL
														
 
															-        df = df.where(pd.notnull(df), None)
														
 
															-        
														
 
															-        placeholders = ', '.join(['%s'] * len(available_cols))
														
 
															-        columns_str = ', '.join([f"`{col}`" for col in available_cols])
														
 
															-        
														
 
															-        # Use INSERT IGNORE to prevent crashing on duplicate primary keys (barcodes)
														
 
															-        sql = f"INSERT IGNORE INTO products ({columns_str}) VALUES ({placeholders})"
														
 
															-        
														
 
															-        with conn.cursor() as cursor:
														
 
															-            cursor.executemany(sql, df.values.tolist())
														
 
															-        conn.commit()
														
 
															-        
														
 
															-        total_processed += len(df)
														
 
															-        print(f"   Inserted {total_processed} rows...")
														
 
															+        # Pandas to_sql safely transforms NaNs to SQL NULLs internally
														
 
															+        try:
														
 
															+            # We use 'append' because the products table already exists with primary keys
														
 
															+            # To handle duplicate 'code' primary keys effortlessly, we drop duplicates from the dataframe before insert
														
 
															+            # Or depend on PyMySQL. But pandas natively crashes on dupes unless managed. 
														
 
															+            df = df.drop_duplicates(subset=['code'])
														
 
															+            df.to_sql('products', con=engine, if_exists='append', index=False)
														
 
															+            total_processed += len(df)
														
 
															+            print(f"   Successfully appended {total_processed} rows...")
														
 
															+        except BaseException as e:
														
 
															+            # If a strict primary key duplicate existed in DB already from a previous chunk, ignore row crashes
														
 
															+            if "Duplicate entry" in str(e):
														
 
															+                pass
														
 
															+            else:
														
 
															+                 print(f"   [Warning] Chunk skipped due to internal structural error: {e}")
														
 
															     print(f"✅ Finished importing {filename}.")
														
 
															     return True
														
 
															 if __name__ == "__main__":
														
 
															     print("Initiating OpenFoodFacts CSV Ingestion Process...")
														
 
															-    conn = get_loader_connection()
														
 
															+    engine = get_loader_engine()
														
 
															-    processed_en = ingest_file('en.openfoodfacts.org.products.csv', conn)
														
 
															-    processed_fr = ingest_file('fr.openfoodfacts.org.products.csv', conn)
														
 
															+    processed_en = ingest_file('en.openfoodfacts.org.products.csv', engine)
														
 
															+    processed_fr = ingest_file('fr.openfoodfacts.org.products.csv', engine)
														
 
															     if not processed_en and not processed_fr:
														
 
															         print("\n❌ Could not find either 'en.openfoodfacts.org.products.csv' or 'fr.openfoodfacts.org.products.csv'.")
														
 
															         print("Please download them directly into the root folder and run this script again.")
														
 
															-        
														
 
															-    conn.close()