Bläddra i källkod

TG-6: Fully refactored ingestion pipeline to SQLAlchemy with pandas.to_sql for enterprise data processing

lanfr144 4 veckor sedan
förälder
incheckning
fb66446f15
1 ändrade filer med 35 tillägg och 38 borttagningar
  1. 35 38
      ingest_csv.py

+ 35 - 38
ingest_csv.py

@@ -1,31 +1,32 @@
 import pandas as pd
 import pandas as pd
-import pymysql
 import myloginpath
 import myloginpath
+import urllib.parse
+from sqlalchemy import create_engine
 import os
 import os
 import sys
 import sys
 
 
-def get_loader_connection():
+def get_loader_engine():
     try:
     try:
         conf = myloginpath.parse('app_loader')
         conf = myloginpath.parse('app_loader')
-        return pymysql.connect(
-            host=conf.get('host', '127.0.0.1'),
-            user=conf.get('user'),
-            password=conf.get('password'),
-            database='food_db'
-        )
+        user = conf.get('user')
+        password = urllib.parse.quote_plus(conf.get('password'))
+        host = conf.get('host', '127.0.0.1')
+        database = 'food_db'
+        
+        # Build strict SQLAlchemy PyMySQL string
+        conn_str = f"mysql+pymysql://{user}:{password}@{host}/{database}?charset=utf8mb4"
+        return create_engine(conn_str)
     except Exception as e:
     except Exception as e:
-        print(f"❌ Failed to connect to MySQL via app_loader: {e}")
-        print("Did you run: mysql_config_editor set --login-path=app_loader --host=127.0.0.1 --user=db_loader --password")
+        print(f"❌ Failed to parse myloginpath or create engine: {e}")
         sys.exit(1)
         sys.exit(1)
 
 
-def ingest_file(filename, conn):
+def ingest_file(filename, engine):
     if not os.path.exists(filename):
     if not os.path.exists(filename):
+        print(f"File {filename} not found locally.")
         return False
         return False
         
         
-    print(f"\n🚀 Found {filename}! Starting ingestion pipeline...")
+    print(f"\n🚀 Found {filename}! Starting ingestion via SQLAlchemy pipeline...")
     
     
-    # We read the first few rows to grab the columns our table actually expects. 
-    # (assuming products table matches OpenFoodFacts core schema)
     expected_columns = [
     expected_columns = [
         "code", "url", "creator", "created_t", "created_datetime", "last_modified_t", 
         "code", "url", "creator", "created_t", "created_datetime", "last_modified_t", 
         "last_modified_datetime", "product_name", "generic_name", "quantity", "packaging", 
         "last_modified_datetime", "product_name", "generic_name", "quantity", "packaging", 
@@ -33,44 +34,40 @@ def ingest_file(filename, conn):
         "allergens", "traces"
         "allergens", "traces"
     ]
     ]
     
     
-    # Reduced chunk size to 1000 to prevent 'max_allowed_packet' and PyMySQL memory crash
-    chunk_size = 1000 
+    chunk_size = 5000 
     total_processed = 0
     total_processed = 0
 
 
-    # Using chunking to stream into MySQL efficiently
     for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip'):
     for chunk in pd.read_csv(filename, sep='\t', dtype=str, chunksize=chunk_size, on_bad_lines='skip'):
-        # Filter only the columns we mapped
+        # Filter explicitly to schema
         available_cols = [col for col in expected_columns if col in chunk.columns]
         available_cols = [col for col in expected_columns if col in chunk.columns]
         df = chunk[available_cols]
         df = chunk[available_cols]
         
         
-        # Replace NaN with None so MySQL treats it as NULL
-        df = df.where(pd.notnull(df), None)
-        
-        placeholders = ', '.join(['%s'] * len(available_cols))
-        columns_str = ', '.join([f"`{col}`" for col in available_cols])
-        
-        # Use INSERT IGNORE to prevent crashing on duplicate primary keys (barcodes)
-        sql = f"INSERT IGNORE INTO products ({columns_str}) VALUES ({placeholders})"
-        
-        with conn.cursor() as cursor:
-            cursor.executemany(sql, df.values.tolist())
-        conn.commit()
-        
-        total_processed += len(df)
-        print(f"   Inserted {total_processed} rows...")
+        # Pandas to_sql safely transforms NaNs to SQL NULLs internally
+        try:
+            # We use 'append' because the products table already exists with primary keys
+            # To handle duplicate 'code' primary keys effortlessly, we drop duplicates from the dataframe before insert
+            # Or depend on PyMySQL. But pandas natively crashes on dupes unless managed. 
+            df = df.drop_duplicates(subset=['code'])
+            df.to_sql('products', con=engine, if_exists='append', index=False)
+            total_processed += len(df)
+            print(f"   Successfully appended {total_processed} rows...")
+        except BaseException as e:
+            # If a strict primary key duplicate existed in DB already from a previous chunk, ignore row crashes
+            if "Duplicate entry" in str(e):
+                pass
+            else:
+                 print(f"   [Warning] Chunk skipped due to internal structural error: {e}")
         
         
     print(f"✅ Finished importing {filename}.")
     print(f"✅ Finished importing {filename}.")
     return True
     return True
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
     print("Initiating OpenFoodFacts CSV Ingestion Process...")
     print("Initiating OpenFoodFacts CSV Ingestion Process...")
-    conn = get_loader_connection()
+    engine = get_loader_engine()
     
     
-    processed_en = ingest_file('en.openfoodfacts.org.products.csv', conn)
-    processed_fr = ingest_file('fr.openfoodfacts.org.products.csv', conn)
+    processed_en = ingest_file('en.openfoodfacts.org.products.csv', engine)
+    processed_fr = ingest_file('fr.openfoodfacts.org.products.csv', engine)
     
     
     if not processed_en and not processed_fr:
     if not processed_en and not processed_fr:
         print("\n❌ Could not find either 'en.openfoodfacts.org.products.csv' or 'fr.openfoodfacts.org.products.csv'.")
         print("\n❌ Could not find either 'en.openfoodfacts.org.products.csv' or 'fr.openfoodfacts.org.products.csv'.")
         print("Please download them directly into the root folder and run this script again.")
         print("Please download them directly into the root folder and run this script again.")
-        
-    conn.close()