Creating long dataframes from puffy tables¶
This tutorial will show the basic features of using the puffbird.puffy_to_long
method
[1]:
import numpy as np
import pandas as pd
import puffbird as pb
A weirdly complex table¶
First, we will create a puffy dataframe as an example:
[2]:
df = pd.DataFrame({
# a and c have the same data repeated three times
# b is just a bunch of numpy arrays of the same shapes
# d is also just a bunch of numpy arrays of different shapes
# e contains various pandas DataFrames with the same column structures
# and the same index format.
# f contains various pandas DataFrames with different structures
# g contains mixed data types
# missing data is also included
'a': [
'aa', 'bb', 'cc', 'dd',
'aa', 'bb', 'cc', 'dd',
'aa', 'bb', 'cc', 'dd'
],
'b': [
np.random.random((10, 5)),
np.nan,
np.random.random((10, 5)),
np.random.random((10, 5)),
np.random.random((10, 5)),
np.random.random((10, 5)),
np.random.random((10, 5)),
np.random.random((10, 5)),
np.random.random((10, 5)),
np.random.random((10, 5)),
np.random.random((10, 5)),
np.random.random((10, 5))
],
'c': [
{'dicta':[1,2,3], 'dictb':3, 'dictc':{'key1':1, 'key2':2}},
{'dicta':[52,3], 'dictb':[3,4], 'dictc':{'key4':1, 'key2':2}},
{'dicta':[12,67], 'dictb':(4,5), 'dictc':{'key3':1, 'key2':77}},
{'dicta':[1,23], 'dictb':3, 'dictc':{'key1':55, 'key2':33}},
{'dicta':123, 'dictb':'words', 'dictc':{'key1':4, 'key2':2}},
{'dicta':[1,2,3], 'dictb':3, 'dictc':{'key1':1, 'key2':2}},
{'dicta':[52,3], 'dictb':[3,4], 'dictc':{'key4':1, 'key2':2}},
{'dicta':[12,67], 'dictb':(4,5), 'dictc':{'key3':1, 'key2':77}},
{'dicta':[1,23], 'dictb':3, 'dictc':{'key1':55, 'key2':33}},
{'dicta':123, 'dictb':'words', 'dictc':{'key1':4, 'key2':2}},
{'dicta':[1,2,3], 'dictb':3, 'dictc':{'key1':1, 'key2':2}},
{'dicta':[52,3], 'dictb':[3,4], 'dictc':{'key4':1, 'key2':2}},
],
'd': [
np.random.random((16, 5)),
np.random.random((18, 5)),
np.random.random((19, 5)),
np.random.random((11, 5)),
np.random.random((12, 5)),
np.random.random((14, 5)),
np.random.random((17, 5)),
np.random.random((110, 5)),
None,
np.random.random((2, 5)),
np.random.random((4, 5)),
np.random.random((7, 5))
],
'e': [
pd.DataFrame(
{'c1':[1,2,3], 'c2':[1,2,3]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c'], ['a', 'b', 'c']],
names=['a', 'b']
)
),
pd.DataFrame(
{'c1':[1,2,3,4], 'c2':[1,2,3,4]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd']],
names=['a', 'b']
)
),
pd.DataFrame(
{'c1':[3,4,3], 'c2':[3,5,3]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c'], ['a', 'b', 'c']],
names=['a', 'b']
)
),
np.nan,
pd.DataFrame(
{'c1':[1,2,3], 'c2':[1,2,3]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c'], ['a', 'b', 'c']],
names=['a', 'b']
)
),
pd.DataFrame(
{'c1':[1,2,3,4], 'c2':[1,2,3,4]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd']],
names=['a', 'b']
)
),
pd.DataFrame(
{'c1':[3,4,3], 'c2':[3,5,3]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c'], ['a', 'b', 'c']],
names=['a', 'b']
)
),
np.nan,
pd.DataFrame(
{'c1':[1,2,3], 'c2':[1,2,3]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c'], ['a', 'b', 'c']],
names=['a', 'b']
)
),
pd.DataFrame(
{'c1':[1,2,3,4], 'c2':[1,2,3,4], 'c3':[1,2,3,4]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd']],
names=['a', 'b']
)
),
pd.DataFrame(
{'c1':[3,4,3], 'c2':[3,5,3]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c'], ['a', 'b', 'c']],
names=['a', 'b']
)
),
np.nan,
],
'f': [
pd.DataFrame(
{'f1':[1,2,3], 'hh2':[1,2,3]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c'], ['a', 'b', 'c'], ['f', 'f', 'f']],
names=['f', 'b', 'e']
)
),
pd.DataFrame(
{'hh1':[1,2,3,4], 'qq2':[1,2,3,4]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd']],
names=['a', 'b']
)
),
pd.DataFrame(
{'q1':[3,4,3], 'qq2':[3,5,3], 'c3':[1,2,3], 'c4':[1,2,3]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c'], ['a', 'b', 'c'], ['t', 't', 't']],
names=['y', 'll', 'tt']
)
),
np.nan,
pd.DataFrame(
{'qq1':[1,2,3], 'rr2':[1,2,3]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c'], ['a', 'b', 'c']],
names=['a', 'b']
)
),
pd.DataFrame(
[[1,2,3,4], [1,2,3,4]],
columns=pd.MultiIndex.from_arrays(
[['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd']],
names=['rr', 'b']
),
index=pd.MultiIndex.from_arrays(
[[(1,2), (2,3)], ['a', 'b']],
names=['a', 'b']
)
),
pd.DataFrame(
{'cpp1':[3,4,3], 'c2':[3,5,3]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c'], ['a', 'b', 'c']],
names=['a', 'rr']
)
),
np.nan,
pd.DataFrame(
{'sr1':[1,2,3,4], 'c2':[1,2,3,4]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd']],
names=['a', 'b']
)
),
pd.DataFrame(
{'cpp1':[3,4,3], 'c2':[3,5,3]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c'], ['a', 'b', 'c']],
names=['a', 'b']
)
),
pd.DataFrame(
{'c1':[3,4,3], 'c2':[3,5,3]},
index=pd.MultiIndex.from_arrays(
[['a', 'b', 'c'], ['a', 'b', 'c']],
names=['mm', 'b']
)
),
np.nan,
],
'g': [
'a', 'b', {'ff':'gg'}, {'a', 'b', 'c'},
('r',), pd.Series({'a':'b'}), 'a', 'b',
1, 2, 3, 4
]
})
df
[2]:
a | b | c | d | e | f | g | |
---|---|---|---|---|---|---|---|
0 | aa | [[0.9657556404566287, 0.6105982811179597, 0.71... | {'dicta': [1, 2, 3], 'dictb': 3, 'dictc': {'ke... | [[0.8332176695108217, 0.789958044060405, 0.294... | c1 c2 a b a a 1 1 b b 2 ... | f1 hh2 f b e a a f 1 1 b... | a |
1 | bb | NaN | {'dicta': [52, 3], 'dictb': [3, 4], 'dictc': {... | [[0.3132246893884286, 0.1335576065684959, 0.42... | c1 c2 a b a a 1 1 b b 2 ... | hh1 qq2 a b a a 1 1 b b ... | b |
2 | cc | [[0.2978295126291556, 0.2876008891935764, 0.85... | {'dicta': [12, 67], 'dictb': (4, 5), 'dictc': ... | [[0.06184828264219733, 0.18545094706008847, 0.... | c1 c2 a b a a 3 3 b b 4 ... | q1 qq2 c3 c4 y ll tt ... | {'ff': 'gg'} |
3 | dd | [[0.4963928696663038, 0.7239167468580807, 0.98... | {'dicta': [1, 23], 'dictb': 3, 'dictc': {'key1... | [[0.8993257213551965, 0.2031570590614975, 0.66... | NaN | NaN | {a, c, b} |
4 | aa | [[0.3031081932962224, 0.6450296792517578, 0.32... | {'dicta': 123, 'dictb': 'words', 'dictc': {'ke... | [[0.5945125943356216, 0.15692780064527623, 0.0... | c1 c2 a b a a 1 1 b b 2 ... | qq1 rr2 a b a a 1 1 b b ... | (r,) |
5 | bb | [[0.5167332905196971, 0.7313676911394652, 0.58... | {'dicta': [1, 2, 3], 'dictb': 3, 'dictc': {'ke... | [[0.6826944216406796, 0.06614703871524041, 0.4... | c1 c2 a b a a 1 1 b b 2 ... | rr a b c d b a b c d a ... | a b dtype: object |
6 | cc | [[0.708744333480539, 0.8321196509452965, 0.132... | {'dicta': [52, 3], 'dictb': [3, 4], 'dictc': {... | [[0.6154355882964945, 0.5022137513822291, 0.64... | c1 c2 a b a a 3 3 b b 4 ... | cpp1 c2 a rr a a 3 3 b... | a |
7 | dd | [[0.38385169069805636, 0.4351602576907261, 0.2... | {'dicta': [12, 67], 'dictb': (4, 5), 'dictc': ... | [[0.252611789308698, 0.04665515687154631, 0.04... | NaN | NaN | b |
8 | aa | [[0.8448759939899335, 0.3957149482929728, 0.70... | {'dicta': [1, 23], 'dictb': 3, 'dictc': {'key1... | None | c1 c2 a b a a 1 1 b b 2 ... | sr1 c2 a b a a 1 1 b b ... | 1 |
9 | bb | [[0.19421217606406949, 0.7404434981173305, 0.5... | {'dicta': 123, 'dictb': 'words', 'dictc': {'ke... | [[0.7887921954495609, 0.07707935123200094, 0.5... | c1 c2 c3 a b a a 1 1 ... | cpp1 c2 a b a a 3 3 b b ... | 2 |
10 | cc | [[0.5360890315477351, 0.11323478357643058, 0.7... | {'dicta': [1, 2, 3], 'dictb': 3, 'dictc': {'ke... | [[0.09228121163345293, 0.02214142758664739, 0.... | c1 c2 a b a a 3 3 b b 4 ... | c1 c2 mm b a a 3 3 b b ... | 3 |
11 | dd | [[0.22093473502397243, 0.024389277765600403, 0... | {'dicta': [52, 3], 'dictb': [3, 4], 'dictc': {... | [[0.5195772094908402, 0.2494521739025667, 0.39... | NaN | NaN | 4 |
So this dataframe is quite daunting, I like to call it a puffy table.
Exploding the data out that are in puffy tables¶
Now with puffy_to_long
you can easily unravel this dataframe. Since this dataframe is weirdly constructed puffy_to_long
may take a while:
[3]:
long_df = pb.puffy_to_long(df)
long_df.head()
[3]:
index_level0 | a | b_level0 | b_level1 | b | c_level0 | c_level1 | c | d_level0 | d_level1 | d | e_level0_a | e_level0_b | e_level0_2 | e | f_level0_0 | f_level0_1 | f | g_level0 | g | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | aa | 0.0 | 0.0 | 0.965756 | dicta | 0 | 1 | 0.0 | 0.0 | 0.833218 | a | a | c1 | 1.0 | 0.0 | b | a | NaN | a |
1 | 0 | aa | 0.0 | 0.0 | 0.965756 | dicta | 0 | 1 | 0.0 | 0.0 | 0.833218 | a | a | c1 | 1.0 | 0.0 | e | f | NaN | a |
2 | 0 | aa | 0.0 | 0.0 | 0.965756 | dicta | 0 | 1 | 0.0 | 0.0 | 0.833218 | a | a | c1 | 1.0 | 0.0 | f | a | NaN | a |
3 | 0 | aa | 0.0 | 0.0 | 0.965756 | dicta | 0 | 1 | 0.0 | 0.0 | 0.833218 | a | a | c1 | 1.0 | 0.0 | f1 | 1 | NaN | a |
4 | 0 | aa | 0.0 | 0.0 | 0.965756 | dicta | 0 | 1 | 0.0 | 0.0 | 0.833218 | a | a | c1 | 1.0 | 0.0 | hh2 | 1 | NaN | a |
Now we have a dataframe with only hashable elements. puffy_to_long
iteratively exploded all cells and treated each column individually. For example, if a cell contains a numpy array that is two-dimensional then the column will be exploded twice and two new columns will be added that are called *[COLUMN_NAME]_level0* and *[COLUMN_NAME]_level1*. For our column b
, we get two new columns called b_level0
and b_level1
. These levels will contain the index corresponding to the
data point in the long-format of column b
. Let’s try puffy_to_long
again but just on column b
.
Exploding numpy.array-containing columns¶
[4]:
long_df = pb.puffy_to_long(df, 'b')
long_df.head()
[4]:
index_level0 | b_level0 | b_level1 | b | |
---|---|---|---|---|
0 | 0 | 0 | 0 | 0.965756 |
1 | 0 | 0 | 1 | 0.610598 |
2 | 0 | 0 | 2 | 0.710187 |
3 | 0 | 0 | 3 | 0.851220 |
4 | 0 | 0 | 4 | 0.264982 |
index_level0
is the previous index from our dataframe. If this were a pandas.MultiIndex
, we would have multiple columns instead of just one that corresponds to the old index of the dataframe.
Now, we could take this normal dataframe objects and perform various operations that we would normally want to perform, e.g.:
[5]:
long_df.groupby('b_level0')['b'].mean()
[5]:
b_level0
0 0.489353
1 0.538500
2 0.502751
3 0.490095
4 0.505218
5 0.501370
6 0.474048
7 0.573405
8 0.491318
9 0.516571
Name: b, dtype: float64
Let’s say we want to explode both column b
and column d
that both contain numpy arrays. Imagine that we want to align the axis 1 of the data in b
and d
, and call this axes aligned_axis
, we can do this with puffy_to_long
keyword arguments:
[6]:
long_df = pb.puffy_to_long(df, 'b', 'd', aligned_axis={'b':1, 'd':1})
long_df.head()
[6]:
index_level0 | b_level0 | aligned_axis | b | d_level0 | d | |
---|---|---|---|---|---|---|
0 | 0 | 0.0 | 0 | 0.965756 | 0.0 | 0.833218 |
1 | 0 | 0.0 | 0 | 0.965756 | 1.0 | 0.060445 |
2 | 0 | 0.0 | 0 | 0.965756 | 2.0 | 0.620360 |
3 | 0 | 0.0 | 0 | 0.965756 | 3.0 | 0.610501 |
4 | 0 | 0.0 | 0 | 0.965756 | 4.0 | 0.671548 |
So now, axis 1 of b
and d
column data are aligned and those indices are defined in the aligned_axis
column. Since both columns contain missing cells, some index_level0
values have missing b
and b_level0
columns or missing d_level0
and d
columns. You can view these easily using standard pandas functionality:
[7]:
long_df.loc[long_df['b_level0'].isnull()].head()
[7]:
index_level0 | b_level0 | aligned_axis | b | d_level0 | d | |
---|---|---|---|---|---|---|
10650 | 1 | NaN | 0 | NaN | 0.0 | 0.313225 |
10651 | 1 | NaN | 0 | NaN | 1.0 | 0.630630 |
10652 | 1 | NaN | 0 | NaN | 2.0 | 0.074653 |
10653 | 1 | NaN | 0 | NaN | 3.0 | 0.290296 |
10654 | 1 | NaN | 0 | NaN | 4.0 | 0.563310 |
[8]:
long_df.loc[long_df['d_level0'].isnull()].head()
[8]:
index_level0 | b_level0 | aligned_axis | b | d_level0 | d | |
---|---|---|---|---|---|---|
9950 | 8 | 0.0 | 0 | 0.844876 | NaN | NaN |
9951 | 8 | 1.0 | 0 | 0.905452 | NaN | NaN |
9952 | 8 | 2.0 | 0 | 0.567938 | NaN | NaN |
9953 | 8 | 3.0 | 0 | 0.043224 | NaN | NaN |
9954 | 8 | 4.0 | 0 | 0.736017 | NaN | NaN |
Exploding pandas.DataFrame-containing columns¶
Let’s take a look at how pandas.DataFrame
objects are handled within puffy_to_long
by taking a look at column e
:
[9]:
long_df = pb.puffy_to_long(df, 'e')
long_df.head()
[9]:
index_level0 | e_level0_a | e_level0_b | e_level0_2 | e | |
---|---|---|---|---|---|
0 | 0 | a | a | c1 | 1.0 |
1 | 0 | a | a | c2 | 1.0 |
2 | 0 | b | b | c1 | 2.0 |
3 | 0 | b | b | c2 | 2.0 |
4 | 0 | c | c | c1 | 3.0 |
pandas.DataFrame
objects are handled within one explosion iteration, unless the cell within that dataframe are non-hashable. This is why all new columns contain level0
. The first two new columns e_level0_a
and e_level0_b
correspond to the pandas.MultiIndex
index defined in all dataframes within this column. e_level0_2
corresponds to all the columns names of the dataframe. e
only contains the data within each cell of each dataframe.
Let’s say we don’t want to unravel our columns in this way but instead just concatenate them all together. We can use the expand_cols
argument for this, which expects a list of column names that contain only pandas.DataFrame
or pandas.Series
objects:
[10]:
long_df = pb.puffy_to_long(df, 'e', expand_cols=['e'])
long_df.head()
[10]:
index_level0 | a | b | e_c1 | e_c2 | e_c3 | |
---|---|---|---|---|---|---|
0 | 0 | a | a | 1 | 1 | NaN |
1 | 0 | b | b | 2 | 2 | NaN |
2 | 0 | c | c | 3 | 3 | NaN |
3 | 1 | a | a | 1 | 1 | NaN |
4 | 1 | b | b | 2 | 2 | NaN |
Here we preserved the columns of each dataframe in each cell and simply concatenated the dataframes together with the index_level0
information preserved. What if we use this method while also exploding column a
, since this has the same column name as the column in the dataframes within column e
:
[11]:
long_df = pb.puffy_to_long(df, 'a', 'e', expand_cols=['e'])
long_df.head()
[11]:
index_level0 | a | a_e | b | e_c1 | e_c2 | e_c3 | |
---|---|---|---|---|---|---|---|
0 | 0 | aa | a | a | 1.0 | 1.0 | NaN |
1 | 0 | aa | b | b | 2.0 | 2.0 | NaN |
2 | 0 | aa | c | c | 3.0 | 3.0 | NaN |
3 | 1 | bb | a | a | 1.0 | 1.0 | NaN |
4 | 1 | bb | b | b | 2.0 | 2.0 | NaN |
Since the column a
already existed, the column a
within each dataframe within column e
was renamed to a_e
.
Of course, similarly, this is handled with column b
:
[12]:
long_df = pb.puffy_to_long(df, 'b', 'e', expand_cols=['e'])
long_df.head()
[12]:
index_level0 | b_level0 | b_level1 | b | a | b_e | e_c1 | e_c2 | e_c3 | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.0 | 0.0 | 0.965756 | a | a | 1.0 | 1.0 | NaN |
1 | 0 | 0.0 | 0.0 | 0.965756 | b | b | 2.0 | 2.0 | NaN |
2 | 0 | 0.0 | 0.0 | 0.965756 | c | c | 3.0 | 3.0 | NaN |
3 | 0 | 0.0 | 1.0 | 0.610598 | a | a | 1.0 | 1.0 | NaN |
4 | 0 | 0.0 | 1.0 | 0.610598 | b | b | 2.0 | 2.0 | NaN |
Less structured dataframe-containing columns will result in more complex long-format dataframes:
[13]:
long_df = pb.puffy_to_long(df, 'f')
long_df.head()
[13]:
index_level0 | f_level0_0 | f_level0_1 | f | |
---|---|---|---|---|
0 | 0 | 0 | b | a |
1 | 0 | 0 | e | f |
2 | 0 | 0 | f | a |
3 | 0 | 0 | f1 | 1 |
4 | 0 | 0 | hh2 | 1 |
[14]:
long_df = pb.puffy_to_long(df, 'f', expand_cols=['f'])
long_df.head()
[14]:
index_level0 | level_1 | f_f | f_b | f_e | f_f1 | f_hh2 | f_a | f_hh1 | f_qq2 | ... | f_('a', 'a') | f_('b', 'b') | f_('c', 'c') | f_('d', 'd') | f_rr | f_cpp1 | f_c2 | f_sr1 | f_mm | f_c1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | a | a | f | 1.0 | 1.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 0 | 1 | b | b | f | 2.0 | 2.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 0 | 2 | c | c | f | 3.0 | 3.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 1 | 0 | NaN | a | NaN | NaN | NaN | a | 1.0 | 1.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 1 | 1 | NaN | b | NaN | NaN | NaN | b | 2.0 | 2.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 28 columns
Exploding dictionaries¶
The column c
contains dictionaries with various data types that are in them. The puffy_to_long
algorithm iteratively explodes all objects within the dictionaries:
[15]:
long_df = pb.puffy_to_long(df, 'c')
long_df.head()
[15]:
index_level0 | c_level0 | c_level1 | c | |
---|---|---|---|---|
0 | 0 | dicta | 0 | 1 |
1 | 0 | dicta | 1 | 2 |
2 | 0 | dicta | 2 | 3 |
3 | 0 | dictb | NaN | 3 |
4 | 0 | dictc | key1 | 1 |
Since some values within dictionaries can be further exploded, while others cannot some levels/axes contain NaNs when the explosion iteration for that data type stopped (for a specific row).