from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name, col, size, length
import math
import time


def analyze_dataframe_and_get_optimal_config(df, operation_type="read", target_cluster_cores=None, target_cluster_memory_gb=None):
    """
    Comprehensive DataFrame analysis with optimal Spark configuration recommendations

    Args:
        df: PySpark DataFrame to analyze
        operation_type: Type of operation ("read", "write", "transform", "ml", "join")
        target_cluster_cores: Total cores available in cluster (if known)
        ...