Source code for pyspark_analyzer.api

import pandas as pd
from pyspark.sql import DataFrame

from .logging import get_logger
from .profiler import profile_dataframe
from .sampling import SamplingConfig

logger = get_logger(__name__)


[docs] def analyze( df: DataFrame, *, sampling: bool | None = None, target_rows: int | None = None, fraction: float | None = None, columns: list[str] | None = None, output_format: str = "pandas", include_advanced: bool = True, include_quality: bool = True, seed: int | None = None, show_progress: bool | None = None, ) -> pd.DataFrame | dict | str: """ Analyze a PySpark DataFrame and generate comprehensive statistics. This is the simplified entry point for profiling DataFrames. It automatically handles sampling configuration based on the provided parameters. Note: Compatible with PySpark 3.0.0+. Uses native median() function when available (PySpark 3.4.0+) for better performance, with automatic fallback to percentile_approx for older versions. Args: df: PySpark DataFrame to analyze sampling: Whether to enable sampling. If None, auto-sampling is enabled for large datasets. If False, no sampling. If True, uses default sampling. target_rows: Sample to approximately this many rows. Mutually exclusive with fraction. fraction: Sample this fraction of the data (0.0-1.0). Mutually exclusive with target_rows. columns: List of specific columns to profile. If None, profiles all columns. output_format: Output format ("pandas", "dict", "json", "summary"). Default is "pandas". include_advanced: Include advanced statistics (skewness, kurtosis, outliers, etc.) include_quality: Include data quality metrics seed: Random seed for reproducible sampling show_progress: Show progress indicators during analysis. If None, auto-detects based on environment. Returns: Profile results in the requested format: - "pandas": pandas DataFrame with statistics - "dict": Python dictionary - "json": JSON string - "summary": Human-readable summary string Examples: >>> # Basic usage with auto-sampling >>> profile = analyze(df) >>> # Disable sampling >>> profile = analyze(df, sampling=False) >>> # Sample to 100,000 rows >>> profile = analyze(df, target_rows=100_000) >>> # Sample 10% of data >>> profile = analyze(df, fraction=0.1) >>> # Profile specific columns only >>> profile = analyze(df, columns=["age", "salary"]) >>> # Get results as dictionary >>> profile = analyze(df, output_format="dict") """ logger.info(f"Starting DataFrame analysis with output_format={output_format}") # Build sampling configuration sampling_config = _build_sampling_config( sampling=sampling, target_rows=target_rows, fraction=fraction, seed=seed, ) # Delegate to profiler try: result = profile_dataframe( dataframe=df, columns=columns, output_format=output_format, include_advanced=include_advanced, include_quality=include_quality, sampling_config=sampling_config, show_progress=show_progress, ) logger.info("DataFrame analysis completed successfully") return result except Exception as e: logger.error(f"Error during DataFrame analysis: {e!s}", exc_info=True) raise
def _build_sampling_config( sampling: bool | None, target_rows: int | None, fraction: float | None, seed: int | None, ) -> SamplingConfig: """ Build SamplingConfig from simplified parameters. Args: sampling: Whether to enable sampling target_rows: Target number of rows to sample fraction: Fraction of data to sample seed: Random seed Returns: SamplingConfig instance """ # If sampling is explicitly disabled if sampling is False: return SamplingConfig(enabled=False) # Let SamplingConfig handle validation return SamplingConfig( enabled=sampling if sampling is not None else True, target_rows=target_rows, fraction=fraction, seed=seed if seed is not None else 42, )