ingest_csv.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. import pandas as pd
  2. import myloginpath
  3. import urllib.parse
  4. from sqlalchemy import create_engine, text
  5. from sqlalchemy.types import VARCHAR, TEXT, DOUBLE
  6. import os
  7. import sys
  8. from snmp_notifier import notifier
  def get_loader_engine():
      """Create the SQLAlchemy engine used to load data into ``food_db``.

      Credentials and host are read from the ``app_loader`` section of the
      MySQL login path via ``myloginpath``; the host falls back to
      ``127.0.0.1`` when the section does not define one.

      Returns:
          The engine returned by ``create_engine`` for a ``mysql+pymysql``
          URL using the ``utf8mb4`` charset.

      Exits:
          Prints the failure and terminates the process with status 1 when
          the login path cannot be parsed or the engine cannot be created.
      """
      # Imported locally so this function stays self-contained.
      from sqlalchemy.engine import URL

      try:
          conf = myloginpath.parse('app_loader')
          # URL.create takes the raw components and does its own escaping, so
          # special characters in the user name or password cannot corrupt
          # the URL, and a missing user/password is simply left out instead of
          # raising or being rendered as the text "None".
          url = URL.create(
              drivername="mysql+pymysql",
              username=conf.get('user'),
              password=conf.get('password'),
              host=conf.get('host', '127.0.0.1'),
              database='food_db',
              query={"charset": "utf8mb4"},
          )
          return create_engine(url)
      except Exception as e:
          print(f"❌ Failed to parse myloginpath or create engine: {e}")
          sys.exit(1)
  # Destination table -> CSV columns stored in it. Every table carries `code`.
  _TABLE_GROUPS = {
      'products_core': ['code', 'product_name', 'generic_name', 'brands', 'ingredients_text'],
      'products_allergens': ['code', 'allergens'],
      'products_macros': ['code', 'energy-kcal_100g', 'proteins_100g', 'fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'sodium_100g', 'salt_100g', 'cholesterol_100g'],
      'products_vitamins': ['code', 'vitamin-a_100g', 'vitamin-b1_100g', 'vitamin-b2_100g', 'vitamin-pp_100g', 'vitamin-b6_100g', 'vitamin-b9_100g', 'vitamin-b12_100g', 'vitamin-c_100g', 'vitamin-d_100g', 'vitamin-e_100g', 'vitamin-k_100g'],
      'products_minerals': ['code', 'calcium_100g', 'iron_100g', 'magnesium_100g', 'potassium_100g', 'zinc_100g'],
  }
  # Tables whose non-key columns are staged as TEXT; all others are staged as DOUBLE.
  _TEXT_TABLES = frozenset({'products_core', 'products_allergens'})
  _CHUNK_SIZE = 10000
  _MILESTONE_EVERY = 50000


  def _prepare_slice(df, table_name, columns):
      """Return the column subset for one table plus the SQL dtypes to stage it with."""
      slice_df = df[columns].copy()
      value_cols = [col for col in columns if col != 'code']
      if table_name in _TEXT_TABLES:
          value_type = TEXT
      else:
          # Unparseable values become NaN instead of failing the whole chunk.
          for col in value_cols:
              slice_df[col] = pd.to_numeric(slice_df[col], errors='coerce')
          value_type = DOUBLE
      sql_dtypes = {col: value_type() for col in value_cols}
      sql_dtypes['code'] = VARCHAR(50)
      return slice_df, sql_dtypes


  def _load_group(engine, table_name, slice_df, sql_dtypes):
      """Stage ``slice_df`` in ``temp_<table>``, INSERT IGNORE it into the table, then drop the stage."""
      temp_name = f"temp_{table_name}"
      cols_str = ", ".join(f"`{c}`" for c in slice_df.columns)
      try:
          slice_df.to_sql(temp_name, con=engine, if_exists='replace', index=False, dtype=sql_dtypes)
          with engine.begin() as conn:
              conn.execute(text(f"INSERT IGNORE INTO {table_name} ({cols_str}) SELECT {cols_str} FROM {temp_name}"))
      finally:
          # Drop the staging table even when the insert failed, so an aborted
          # chunk does not leave a temp_* table behind.
          with engine.begin() as conn:
              conn.execute(text(f"DROP TABLE IF EXISTS {temp_name}"))


  def ingest_file(filename, engine):
      """Load a tab-separated product export into the grouped product tables.

      The file is read in chunks of ``_CHUNK_SIZE`` rows with every column as
      a string. For each chunk, rows without a ``code`` and rows repeating a
      ``code`` within the chunk are dropped, columns absent from the file are
      added as empty, and each column group is merged into its table with
      ``INSERT IGNORE`` through a staging table. A chunk that raises is
      reported (alert + console warning) and skipped; ingestion continues.

      Args:
          filename: Path of the TSV file to ingest.
          engine: SQLAlchemy engine used for staging and inserting.

      Returns:
          ``False`` when ``filename`` does not exist, ``True`` once the whole
          file has been iterated (even if individual chunks were skipped).
      """
      if not os.path.exists(filename):
          print(f"File {filename} not found locally.")
          return False

      print(f"\n🚀 Found {filename}! Starting grouped vertical partition ingestion...")
      total_processed = 0
      next_milestone = _MILESTONE_EVERY
      all_required_cols = sorted({col for cols in _TABLE_GROUPS.values() for col in cols})

      # The context manager closes the underlying file handle even when
      # iteration is interrupted.
      with pd.read_csv(filename, sep='\t', dtype=str, chunksize=_CHUNK_SIZE, on_bad_lines='skip', low_memory=False, encoding='utf-8') as reader:
          for chunk in reader:
              try:
                  if 'code' not in chunk.columns:
                      continue
                  # Drop rows with missing codes and in-chunk duplicates.
                  df = chunk.dropna(subset=['code']).drop_duplicates(subset=['code']).copy()
                  if df.empty:
                      # Nothing to insert; skip creating and dropping staging tables.
                      continue
                  # Ensure all required columns exist in the chunk (fill with None if missing).
                  for col in all_required_cols:
                      if col not in df.columns:
                          df[col] = None
                  for table_name, columns in _TABLE_GROUPS.items():
                      slice_df, sql_dtypes = _prepare_slice(df, table_name, columns)
                      _load_group(engine, table_name, slice_df, sql_dtypes)
                  total_processed += len(df)
                  print(f" Successfully appended {total_processed} rows into grouped tables...", end="\r")
                  # Chunk sizes vary after filtering, so the running total rarely
                  # lands exactly on a multiple of the interval; alert whenever a
                  # threshold is crossed instead of testing divisibility.
                  if total_processed >= next_milestone:
                      next_milestone = (total_processed // _MILESTONE_EVERY + 1) * _MILESTONE_EVERY
                      notifier.send_alert(f"Ingestion Milestone: {total_processed} rows processed")
              except Exception as e:
                  # Exception, not BaseException: Ctrl-C and sys.exit() must
                  # stop the run rather than be reported as a skipped chunk.
                  notifier.send_alert(f"Ingestion Exception: {str(e)}")
                  print(f"\n [Warning] Chunk skipped due to error: {e}")

      notifier.send_alert(f"Ingestion Finished: {filename}")
      print(f"\n✅ Finished importing {filename}.")
      return True
  if __name__ == "__main__":
      # Script entry point: build the loader engine, then attempt both
      # OpenFoodFacts exports in order (English first, then French).
      print("Initiating OpenFoodFacts Grouped Vertical Ingestion Process...")
      engine = get_loader_engine()
      # Every file is attempted regardless of the outcome of the previous one;
      # each entry records whether that file was found and iterated.
      outcomes = [
          ingest_file(csv_name, engine)
          for csv_name in (
              'en.openfoodfacts.org.products.csv',
              'fr.openfoodfacts.org.products.csv',
          )
      ]
      if any(outcomes):
          print("\n🎉 Full database reload complete! Ready for AI RAG.")
      else:
          print("\n❌ Could not find CSVs.")