# utils_data.py
import pandas as pd
import numpy as np

def _filter_by_dtype(df: pd.DataFrame, includes: tuple, excludes: tuple) -> pd.DataFrame:
    """Return the columns of ``df`` whose dtype is in ``includes`` but not in ``excludes``."""
    selectors = []
    for dtype in includes:
        if dtype in excludes:
            continue
        if dtype is pd.Categorical:
            # select_dtypes spells categorical columns as the string 'category'
            selectors.append("category")
            continue
        selectors.append(dtype)
        if dtype is np.datetime64:
            # np.datetime64 alone only matches timezone-naive datetime columns
            selectors.append("datetimetz")

    if not selectors:
        # select_dtypes rejects an empty selection; nothing is included, so
        # hand back a frame with the same rows and no columns
        return df.iloc[:, :0]
    return df.select_dtypes(include=selectors)


def df_info(df: pd.DataFrame,
            show_rows: int = 0,
            horizontal: bool = True,
            percentiles: tuple = (.25, .5, .75),
            includes: tuple = (bool, int, float, object, pd.Categorical, np.datetime64, np.timedelta64),
            excludes: tuple = (),
            selected_cols: list = None
            ):
    """
    Build, display and return a per-column summary of a DataFrame.

    The summary combines the dtype of every analysed column, its number of
    missing values, the output of ``Series.describe`` (without ``count``) and,
    optionally, the first rows of the data. The input DataFrame is not modified.

    :param df: pd.DataFrame object which should be analysed
    :param show_rows: number of rows to display, if -1 all rows will be displayed
    :param horizontal: if True, every analysed column becomes one row of the
        summary; if False, every analysed column stays a column
    :param percentiles: percentile ranks of given data to show
    :param includes: data types which should be included into analysis,
        valid values: bool, int, float, object, pd.Categorical, np.datetime64, np.timedelta64.
        Columns whose dtype matches none of the included types are dropped.
    :param excludes: data types which should be excluded from analysis,
        valid values: bool, int, float, object, pd.Categorical, np.datetime64, np.timedelta64
    :param selected_cols: specify columns for analysis (if selected the includes and excludes arguments are disabled)
    :return: Pandas DataFrame object containing information about given DataFrame
    :raises ValueError: if no column is left to analyse
    """
    if show_rows == -1:
        show_rows = len(df)

    # you can either exclude some data types or specify columns to
    # reduce the numbers of columns which are being analyzed
    if selected_cols is not None and len(selected_cols) > 0:
        df = df.loc[:, selected_cols]
    else:
        # filter on the dtype of each column, not on the column label
        df = _filter_by_dtype(df, includes, excludes)

    if df.columns.empty:
        raise ValueError('DataFrame is empty!')

    # data types
    types = df.dtypes.to_frame("dtype")

    # count missing values
    nans = df.isnull().sum().to_frame("missing")

    # description of the dataframe (mean, median, std, min-max values, frequency, uniques etc);
    # describing column by column keeps numeric and non-numeric statistics side by side
    descriptions = [df[col].describe(percentiles=percentiles, include="all").drop(["count"]) for col in df]
    descriptions = pd.concat(descriptions, axis=1, sort=False)

    # show the first few rows depending on how many you want to show
    head = df.head(show_rows)

    # display resulting df either vertically or horizontally
    if horizontal:
        info_df = pd.concat([types, nans, descriptions.T, head.T], axis=1, sort=False)
    else:
        info_df = pd.concat([types.T, nans.T, descriptions, head], axis=0, sort=False)

    # show all rows and columns, no matter how large the actual dataframe is
    with pd.option_context("display.max_rows", None, "display.max_columns", None):
        # do not permanently change pandas settings -> with statement
        try:
            from IPython.display import display
        except ImportError:
            # IPython is optional; fall back to plain text output
            print(info_df)
        else:
            display(info_df)

    return info_df