Creating long dataframes from puffy tables

This tutorial will show the basic features of using the puffbird.puffy_to_long method

[1]:
import numpy as np
import pandas as pd
import puffbird as pb

A weirdly complex table

First, we will create a puffy dataframe as an example:

[2]:
df = pd.DataFrame({
    # a and c have the same data repeated three times
    # b is just a bunch of numpy arrays of the same shapes
    # d is also just a bunch of numpy arrays of different shapes
    # e contains various pandas DataFrames with the same column structures
    #   and the same index format.
    # f contains various pandas DataFrames with different structures
    # g contains mixed data types
    # missing data is also included
    'a': [
        'aa', 'bb', 'cc', 'dd',
        'aa', 'bb', 'cc', 'dd',
        'aa', 'bb', 'cc', 'dd'
    ],
    'b': [
        np.random.random((10, 5)),
        np.nan,
        np.random.random((10, 5)),
        np.random.random((10, 5)),
        np.random.random((10, 5)),
        np.random.random((10, 5)),
        np.random.random((10, 5)),
        np.random.random((10, 5)),
        np.random.random((10, 5)),
        np.random.random((10, 5)),
        np.random.random((10, 5)),
        np.random.random((10, 5))
    ],
    'c': [
        {'dicta':[1,2,3], 'dictb':3, 'dictc':{'key1':1, 'key2':2}},
        {'dicta':[52,3], 'dictb':[3,4], 'dictc':{'key4':1, 'key2':2}},
        {'dicta':[12,67], 'dictb':(4,5), 'dictc':{'key3':1, 'key2':77}},
        {'dicta':[1,23], 'dictb':3, 'dictc':{'key1':55, 'key2':33}},
        {'dicta':123, 'dictb':'words', 'dictc':{'key1':4, 'key2':2}},
        {'dicta':[1,2,3], 'dictb':3, 'dictc':{'key1':1, 'key2':2}},
        {'dicta':[52,3], 'dictb':[3,4], 'dictc':{'key4':1, 'key2':2}},
        {'dicta':[12,67], 'dictb':(4,5), 'dictc':{'key3':1, 'key2':77}},
        {'dicta':[1,23], 'dictb':3, 'dictc':{'key1':55, 'key2':33}},
        {'dicta':123, 'dictb':'words', 'dictc':{'key1':4, 'key2':2}},
        {'dicta':[1,2,3], 'dictb':3, 'dictc':{'key1':1, 'key2':2}},
        {'dicta':[52,3], 'dictb':[3,4], 'dictc':{'key4':1, 'key2':2}},
    ],
    'd': [
        np.random.random((16, 5)),
        np.random.random((18, 5)),
        np.random.random((19, 5)),
        np.random.random((11, 5)),
        np.random.random((12, 5)),
        np.random.random((14, 5)),
        np.random.random((17, 5)),
        np.random.random((110, 5)),
        None,
        np.random.random((2, 5)),
        np.random.random((4, 5)),
        np.random.random((7, 5))
    ],
    'e': [
        pd.DataFrame(
            {'c1':[1,2,3], 'c2':[1,2,3]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c'], ['a', 'b', 'c']],
                names=['a', 'b']
            )
        ),
        pd.DataFrame(
            {'c1':[1,2,3,4], 'c2':[1,2,3,4]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd']],
                names=['a', 'b']
            )
        ),
        pd.DataFrame(
            {'c1':[3,4,3], 'c2':[3,5,3]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c'], ['a', 'b', 'c']],
                names=['a', 'b']
            )
        ),
        np.nan,
        pd.DataFrame(
            {'c1':[1,2,3], 'c2':[1,2,3]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c'], ['a', 'b', 'c']],
                names=['a', 'b']
            )
        ),
        pd.DataFrame(
            {'c1':[1,2,3,4], 'c2':[1,2,3,4]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd']],
                names=['a', 'b']
            )
        ),
        pd.DataFrame(
            {'c1':[3,4,3], 'c2':[3,5,3]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c'], ['a', 'b', 'c']],
                names=['a', 'b']
            )
        ),
        np.nan,
        pd.DataFrame(
            {'c1':[1,2,3], 'c2':[1,2,3]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c'], ['a', 'b', 'c']],
                names=['a', 'b']
            )
        ),
        pd.DataFrame(
            {'c1':[1,2,3,4], 'c2':[1,2,3,4], 'c3':[1,2,3,4]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd']],
                names=['a', 'b']
            )
        ),
        pd.DataFrame(
            {'c1':[3,4,3], 'c2':[3,5,3]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c'], ['a', 'b', 'c']],
                names=['a', 'b']
            )
        ),
        np.nan,
    ],
    'f': [
        pd.DataFrame(
            {'f1':[1,2,3], 'hh2':[1,2,3]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c'], ['a', 'b', 'c'], ['f', 'f', 'f']],
                names=['f', 'b', 'e']
            )
        ),
        pd.DataFrame(
            {'hh1':[1,2,3,4], 'qq2':[1,2,3,4]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd']],
                names=['a', 'b']
            )
        ),
        pd.DataFrame(
            {'q1':[3,4,3], 'qq2':[3,5,3], 'c3':[1,2,3], 'c4':[1,2,3]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c'], ['a', 'b', 'c'], ['t', 't', 't']],
                names=['y', 'll', 'tt']
            )
        ),
        np.nan,
        pd.DataFrame(
            {'qq1':[1,2,3], 'rr2':[1,2,3]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c'], ['a', 'b', 'c']],
                names=['a', 'b']
            )
        ),
        pd.DataFrame(
            [[1,2,3,4], [1,2,3,4]],
            columns=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd']],
                names=['rr', 'b']
            ),
            index=pd.MultiIndex.from_arrays(
                [[(1,2), (2,3)], ['a', 'b']],
                names=['a', 'b']
            )
        ),
        pd.DataFrame(
            {'cpp1':[3,4,3], 'c2':[3,5,3]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c'], ['a', 'b', 'c']],
                names=['a', 'rr']
            )
        ),
        np.nan,
        pd.DataFrame(
            {'sr1':[1,2,3,4], 'c2':[1,2,3,4]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd']],
                names=['a', 'b']
            )
        ),
        pd.DataFrame(
            {'cpp1':[3,4,3], 'c2':[3,5,3]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c'], ['a', 'b', 'c']],
                names=['a', 'b']
            )
        ),
        pd.DataFrame(
            {'c1':[3,4,3], 'c2':[3,5,3]},
            index=pd.MultiIndex.from_arrays(
                [['a', 'b', 'c'], ['a', 'b', 'c']],
                names=['mm', 'b']
            )
        ),
        np.nan,
    ],
    'g': [
        'a', 'b', {'ff':'gg'}, {'a', 'b', 'c'},
        ('r',), pd.Series({'a':'b'}), 'a', 'b',
        1, 2, 3, 4
    ]
})

df
[2]:
a b c d e f g
0 aa [[0.9657556404566287, 0.6105982811179597, 0.71... {'dicta': [1, 2, 3], 'dictb': 3, 'dictc': {'ke... [[0.8332176695108217, 0.789958044060405, 0.294... c1 c2 a b a a 1 1 b b 2 ... f1 hh2 f b e a a f 1 1 b... a
1 bb NaN {'dicta': [52, 3], 'dictb': [3, 4], 'dictc': {... [[0.3132246893884286, 0.1335576065684959, 0.42... c1 c2 a b a a 1 1 b b 2 ... hh1 qq2 a b a a 1 1 b b ... b
2 cc [[0.2978295126291556, 0.2876008891935764, 0.85... {'dicta': [12, 67], 'dictb': (4, 5), 'dictc': ... [[0.06184828264219733, 0.18545094706008847, 0.... c1 c2 a b a a 3 3 b b 4 ... q1 qq2 c3 c4 y ll tt ... {'ff': 'gg'}
3 dd [[0.4963928696663038, 0.7239167468580807, 0.98... {'dicta': [1, 23], 'dictb': 3, 'dictc': {'key1... [[0.8993257213551965, 0.2031570590614975, 0.66... NaN NaN {a, c, b}
4 aa [[0.3031081932962224, 0.6450296792517578, 0.32... {'dicta': 123, 'dictb': 'words', 'dictc': {'ke... [[0.5945125943356216, 0.15692780064527623, 0.0... c1 c2 a b a a 1 1 b b 2 ... qq1 rr2 a b a a 1 1 b b ... (r,)
5 bb [[0.5167332905196971, 0.7313676911394652, 0.58... {'dicta': [1, 2, 3], 'dictb': 3, 'dictc': {'ke... [[0.6826944216406796, 0.06614703871524041, 0.4... c1 c2 a b a a 1 1 b b 2 ... rr a b c d b a b c d a ... a b dtype: object
6 cc [[0.708744333480539, 0.8321196509452965, 0.132... {'dicta': [52, 3], 'dictb': [3, 4], 'dictc': {... [[0.6154355882964945, 0.5022137513822291, 0.64... c1 c2 a b a a 3 3 b b 4 ... cpp1 c2 a rr a a 3 3 b... a
7 dd [[0.38385169069805636, 0.4351602576907261, 0.2... {'dicta': [12, 67], 'dictb': (4, 5), 'dictc': ... [[0.252611789308698, 0.04665515687154631, 0.04... NaN NaN b
8 aa [[0.8448759939899335, 0.3957149482929728, 0.70... {'dicta': [1, 23], 'dictb': 3, 'dictc': {'key1... None c1 c2 a b a a 1 1 b b 2 ... sr1 c2 a b a a 1 1 b b ... 1
9 bb [[0.19421217606406949, 0.7404434981173305, 0.5... {'dicta': 123, 'dictb': 'words', 'dictc': {'ke... [[0.7887921954495609, 0.07707935123200094, 0.5... c1 c2 c3 a b a a 1 1 ... cpp1 c2 a b a a 3 3 b b ... 2
10 cc [[0.5360890315477351, 0.11323478357643058, 0.7... {'dicta': [1, 2, 3], 'dictb': 3, 'dictc': {'ke... [[0.09228121163345293, 0.02214142758664739, 0.... c1 c2 a b a a 3 3 b b 4 ... c1 c2 mm b a a 3 3 b b ... 3
11 dd [[0.22093473502397243, 0.024389277765600403, 0... {'dicta': [52, 3], 'dictb': [3, 4], 'dictc': {... [[0.5195772094908402, 0.2494521739025667, 0.39... NaN NaN 4

So this dataframe is quite daunting, I like to call it a puffy table.

Exploding the data out that are in puffy tables

Now with puffy_to_long you can easily unravel this dataframe. Since this dataframe is weirdly constructed puffy_to_long may take a while:

[3]:
long_df = pb.puffy_to_long(df)
long_df.head()
[3]:
index_level0 a b_level0 b_level1 b c_level0 c_level1 c d_level0 d_level1 d e_level0_a e_level0_b e_level0_2 e f_level0_0 f_level0_1 f g_level0 g
0 0 aa 0.0 0.0 0.965756 dicta 0 1 0.0 0.0 0.833218 a a c1 1.0 0.0 b a NaN a
1 0 aa 0.0 0.0 0.965756 dicta 0 1 0.0 0.0 0.833218 a a c1 1.0 0.0 e f NaN a
2 0 aa 0.0 0.0 0.965756 dicta 0 1 0.0 0.0 0.833218 a a c1 1.0 0.0 f a NaN a
3 0 aa 0.0 0.0 0.965756 dicta 0 1 0.0 0.0 0.833218 a a c1 1.0 0.0 f1 1 NaN a
4 0 aa 0.0 0.0 0.965756 dicta 0 1 0.0 0.0 0.833218 a a c1 1.0 0.0 hh2 1 NaN a

Now we have a dataframe with only hashable elements. puffy_to_long iteratively exploded all cells and treated each column individually. For example, if a cell contains a numpy array that is two-dimensional then the column will be exploded twice and two new columns will be added that are called *[COLUMN_NAME]_level0* and *[COLUMN_NAME]_level1*. For our column b, we get two new columns called b_level0 and b_level1. These levels will contain the index corresponding to the data point in the long-format of column b. Let’s try puffy_to_long again but just on column b.

Exploding numpy.array-containing columns

[4]:
long_df = pb.puffy_to_long(df, 'b')
long_df.head()
[4]:
index_level0 b_level0 b_level1 b
0 0 0 0 0.965756
1 0 0 1 0.610598
2 0 0 2 0.710187
3 0 0 3 0.851220
4 0 0 4 0.264982

index_level0 is the previous index from our dataframe. If this were a pandas.MultiIndex, we would have multiple columns instead of just one that corresponds to the old index of the dataframe.

Now, we could take this normal dataframe objects and perform various operations that we would normally want to perform, e.g.:

[5]:
long_df.groupby('b_level0')['b'].mean()
[5]:
b_level0
0    0.489353
1    0.538500
2    0.502751
3    0.490095
4    0.505218
5    0.501370
6    0.474048
7    0.573405
8    0.491318
9    0.516571
Name: b, dtype: float64

Let’s say we want to explode both column b and column d that both contain numpy arrays. Imagine that we want to align the axis 1 of the data in b and d, and call this axes aligned_axis, we can do this with puffy_to_long keyword arguments:

[6]:
long_df = pb.puffy_to_long(df, 'b', 'd', aligned_axis={'b':1, 'd':1})
long_df.head()
[6]:
index_level0 b_level0 aligned_axis b d_level0 d
0 0 0.0 0 0.965756 0.0 0.833218
1 0 0.0 0 0.965756 1.0 0.060445
2 0 0.0 0 0.965756 2.0 0.620360
3 0 0.0 0 0.965756 3.0 0.610501
4 0 0.0 0 0.965756 4.0 0.671548

So now, axis 1 of b and d column data are aligned and those indices are defined in the aligned_axis column. Since both columns contain missing cells, some index_level0 values have missing b and b_level0 columns or missing d_level0 and d columns. You can view these easily using standard pandas functionality:

[7]:
long_df.loc[long_df['b_level0'].isnull()].head()
[7]:
index_level0 b_level0 aligned_axis b d_level0 d
10650 1 NaN 0 NaN 0.0 0.313225
10651 1 NaN 0 NaN 1.0 0.630630
10652 1 NaN 0 NaN 2.0 0.074653
10653 1 NaN 0 NaN 3.0 0.290296
10654 1 NaN 0 NaN 4.0 0.563310
[8]:
long_df.loc[long_df['d_level0'].isnull()].head()
[8]:
index_level0 b_level0 aligned_axis b d_level0 d
9950 8 0.0 0 0.844876 NaN NaN
9951 8 1.0 0 0.905452 NaN NaN
9952 8 2.0 0 0.567938 NaN NaN
9953 8 3.0 0 0.043224 NaN NaN
9954 8 4.0 0 0.736017 NaN NaN

Exploding pandas.DataFrame-containing columns

Let’s take a look at how pandas.DataFrame objects are handled within puffy_to_long by taking a look at column e:

[9]:
long_df = pb.puffy_to_long(df, 'e')
long_df.head()
[9]:
index_level0 e_level0_a e_level0_b e_level0_2 e
0 0 a a c1 1.0
1 0 a a c2 1.0
2 0 b b c1 2.0
3 0 b b c2 2.0
4 0 c c c1 3.0

pandas.DataFrame objects are handled within one explosion iteration, unless the cell within that dataframe are non-hashable. This is why all new columns contain level0. The first two new columns e_level0_a and e_level0_b correspond to the pandas.MultiIndex index defined in all dataframes within this column. e_level0_2 corresponds to all the columns names of the dataframe. e only contains the data within each cell of each dataframe.

Let’s say we don’t want to unravel our columns in this way but instead just concatenate them all together. We can use the expand_cols argument for this, which expects a list of column names that contain only pandas.DataFrame or pandas.Series objects:

[10]:
long_df = pb.puffy_to_long(df, 'e', expand_cols=['e'])
long_df.head()
[10]:
index_level0 a b e_c1 e_c2 e_c3
0 0 a a 1 1 NaN
1 0 b b 2 2 NaN
2 0 c c 3 3 NaN
3 1 a a 1 1 NaN
4 1 b b 2 2 NaN

Here we preserved the columns of each dataframe in each cell and simply concatenated the dataframes together with the index_level0 information preserved. What if we use this method while also exploding column a, since this has the same column name as the column in the dataframes within column e:

[11]:
long_df = pb.puffy_to_long(df, 'a', 'e', expand_cols=['e'])
long_df.head()
[11]:
index_level0 a a_e b e_c1 e_c2 e_c3
0 0 aa a a 1.0 1.0 NaN
1 0 aa b b 2.0 2.0 NaN
2 0 aa c c 3.0 3.0 NaN
3 1 bb a a 1.0 1.0 NaN
4 1 bb b b 2.0 2.0 NaN

Since the column a already existed, the column a within each dataframe within column e was renamed to a_e.

Of course, similarly, this is handled with column b:

[12]:
long_df = pb.puffy_to_long(df, 'b', 'e', expand_cols=['e'])
long_df.head()
[12]:
index_level0 b_level0 b_level1 b a b_e e_c1 e_c2 e_c3
0 0 0.0 0.0 0.965756 a a 1.0 1.0 NaN
1 0 0.0 0.0 0.965756 b b 2.0 2.0 NaN
2 0 0.0 0.0 0.965756 c c 3.0 3.0 NaN
3 0 0.0 1.0 0.610598 a a 1.0 1.0 NaN
4 0 0.0 1.0 0.610598 b b 2.0 2.0 NaN

Less structured dataframe-containing columns will result in more complex long-format dataframes:

[13]:
long_df = pb.puffy_to_long(df, 'f')
long_df.head()
[13]:
index_level0 f_level0_0 f_level0_1 f
0 0 0 b a
1 0 0 e f
2 0 0 f a
3 0 0 f1 1
4 0 0 hh2 1
[14]:
long_df = pb.puffy_to_long(df, 'f', expand_cols=['f'])
long_df.head()
[14]:
index_level0 level_1 f_f f_b f_e f_f1 f_hh2 f_a f_hh1 f_qq2 ... f_('a', 'a') f_('b', 'b') f_('c', 'c') f_('d', 'd') f_rr f_cpp1 f_c2 f_sr1 f_mm f_c1
0 0 0 a a f 1.0 1.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 0 1 b b f 2.0 2.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 0 2 c c f 3.0 3.0 NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 1 0 NaN a NaN NaN NaN a 1.0 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 1 1 NaN b NaN NaN NaN b 2.0 2.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 28 columns

Exploding dictionaries

The column c contains dictionaries with various data types that are in them. The puffy_to_long algorithm iteratively explodes all objects within the dictionaries:

[15]:
long_df = pb.puffy_to_long(df, 'c')
long_df.head()
[15]:
index_level0 c_level0 c_level1 c
0 0 dicta 0 1
1 0 dicta 1 2
2 0 dicta 2 3
3 0 dictb NaN 3
4 0 dictc key1 1

Since some values within dictionaries can be further exploded, while others cannot some levels/axes contain NaNs when the explosion iteration for that data type stopped (for a specific row).