Dummy-encode your dataset and invert it

Dummy-encode your dataset and invert it!

There are plenty of ways to get dummy variables for your dataset. Here are some methods that I know of so far:

  • The simplest way : Pandas.get_dummies
  • Scikit-learn style: OneHotEncoder, MultiLabelBinarizer, LabelBinarizer, LabelEncoder

However, none of them provides a simple way to encode your dataset and then invert the encoding. In other words, if your dataset contains categorical data and you dummy-encode it, there seems to be no way to get the original dataset back afterwards. So if you are working on a generative model and want to check its performance, you will get a headache. Therefore, I wrote a simple class named MultiLabelBinarizer_lsb which helps you get over this problem. Here is an example of how to use it:

1
2
3
4
import pandas as pd
import numpy as np
# MultiLabelBinarizer_lsb uses scikit-learn's MultiLabelBinarizer internally, for the convenience of my work
from sklearn.preprocessing import MultiLabelBinarizer

Prepare your data:

1
2
3
# Small demo frame: 'pet' is the categorical column, 'age' and 'salary' are
# numeric. The column order is pinned explicitly because the encoder's flags
# ([0, 1, 0] below) are matched to columns by position, and the order pandas
# derives from a dict literal differs between pandas/Python versions.
testdata = pd.DataFrame(
    {
        'age': [4, 6, 3, 3],
        'pet': ['cat', 'dog', 'dog', 'fish'],
        'salary': [4, 5, 1, 1],
    },
    columns=['age', 'pet', 'salary'],
)

Output

1
2
3
4
5
	age	pet	salary
0 4 cat 4
1 6 dog 5
2 3 dog 1
3 3 fish 1

Get your sword:

1
2
3
4
# Create the encoder.
my_lsb=MultiLabelBinarizer_lsb()
# The second argument holds one flag per column of `testdata`:
# 0 means the column is not categorical (kept as is), 1 means categorical (encoded).
# NOTE: flags are matched to columns by position, so check `testdata.columns`
# first -- [0,1,0] assumes the column order age, pet, salary shown above.
test_X=my_lsb.fit(testdata, [0,1,0])

Here is what test_X looks like:

1
2
3
4
5
6

age 0 1 2 salary
0 4 1 0 0 4
1 6 0 1 0 5
2 3 0 1 0 1
3 3 0 0 1 1

Transform your dataset back:

1
# Decode the encoded frame back into the original column layout.
my_lsb.inverse_transform(test_X)

Output:

1
2
3
4
5
	age	pet	  salary
0 4 cat 4
1 6 dog 5
2 3 dog 1
3 3 fish 1

Source Code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
class MultiLabelBinarizer_lsb(object):
    """Dummy-encode selected DataFrame columns and decode them back.

    Columns are addressed by position. Each column flagged as categorical is
    encoded with its own ``MultiLabelBinarizer``; every other column is passed
    through unchanged. The encoder remembers which slice of the encoded frame
    belongs to which original column so that ``inverse_transform`` can rebuild
    a frame with the original column names.

    Attributes set by ``fit``:
        mlb_dict: column position -> fitted ``MultiLabelBinarizer``
            (categorical columns only).
        mlb_encode_size: column position -> ``(start, end)`` half-open slice
            of that column inside the encoded frame.
        encode_list: the flags passed to ``fit`` (0 = pass through,
            anything else = categorical).
        col_name_list: column names of the frame passed to ``fit``.
    """

    def __init__(self):
        self.mlb_dict = {}
        self.mlb_encode_size = {}
        self.encode_list = []
        self.col_name_list = []

    @staticmethod
    def _as_label_rows(column):
        """Return ``column`` as an (n, 1) array: one single-label row per sample."""
        # Series.reshape is not available in current pandas, so reshape the
        # underlying NumPy array instead.
        return column.values.reshape(-1, 1)

    @staticmethod
    def _assemble(pieces, index):
        """Concatenate positionally aligned pieces side by side and label rows with ``index``."""
        if not pieces:
            return pd.DataFrame(index=index)
        combined = pd.concat(pieces, axis=1)
        combined.index = index
        return combined

    def fit(self, X, encode_list):
        """Fit one binarizer per categorical column and return the encoded ``X``.

        Despite its name this method also transforms: it returns the same
        frame that ``transform(X)`` would produce.

        Args:
            X: pandas DataFrame to learn the categories from.
            encode_list: one flag per column of ``X``, in column order;
                0 keeps the column as is, any other value marks it as
                categorical.

        Returns:
            DataFrame with the rows (and index) of ``X``. Pass-through columns
            keep their names; the columns produced for each categorical column
            are numbered from 0.
        """
        self.encode_list = encode_list
        self.col_name_list = X.columns.values.tolist()
        # Start from a clean slate so a second fit does not inherit encoders
        # or slice positions from a previous one.
        self.mlb_dict = {}
        self.mlb_encode_size = {}

        offset = 0
        for index, col_cat in enumerate(encode_list):
            if col_cat == 0:
                width = 1
            else:
                binarizer = MultiLabelBinarizer()
                binarizer.fit(self._as_label_rows(X.iloc[:, index]))
                self.mlb_dict[index] = binarizer
                width = len(binarizer.classes_)
            self.mlb_encode_size[index] = (offset, offset + width)
            offset += width
        return self.transform(X)

    def transform(self, X):
        """Encode ``X`` with the binarizers learned by ``fit``.

        Args:
            X: pandas DataFrame with the same column layout as the frame
                given to ``fit``.

        Returns:
            Encoded DataFrame carrying the index of ``X``.
        """
        pieces = []
        for index, col_cat in enumerate(self.encode_list):
            column = X.iloc[:, index]
            if col_cat == 0:
                # Pieces are aligned by position, not by label, so that a
                # non-default index on X cannot misalign rows in the concat.
                pieces.append(column.reset_index(drop=True))
            else:
                encoded = self.mlb_dict[index].transform(
                    self._as_label_rows(column))
                pieces.append(pd.DataFrame(encoded))
        return self._assemble(pieces, X.index)

    def inverse_transform(self, encode_X):
        """Rebuild a frame in the original column layout from an encoded frame.

        Args:
            encode_X: DataFrame laid out like the output of ``fit`` /
                ``transform``.

        Returns:
            DataFrame carrying the index of ``encode_X`` and the column names
            recorded by ``fit``. A categorical cell holds the decoded label;
            a row that decodes to no label yields ``None`` and a row that
            decodes to several labels yields the tuple of labels.
        """
        pieces = []
        for index, col_cat in enumerate(self.encode_list):
            start_index, end_index = self.mlb_encode_size[index]
            block = encode_X.iloc[:, start_index:end_index]
            if col_cat == 0:
                pieces.append(block.reset_index(drop=True))
                continue
            label_rows = self.mlb_dict[index].inverse_transform(
                np.asarray(block))
            # Always emit exactly one column per original column; otherwise a
            # row with zero or several labels would change the column count
            # and break the renaming below.
            decoded = [
                row[0] if len(row) == 1 else (tuple(row) or None)
                for row in label_rows
            ]
            pieces.append(pd.DataFrame({0: decoded}))
        new_X = self._assemble(pieces, encode_X.index)
        new_X.columns = self.col_name_list
        return new_X