Python Code Snippets – Import and Cleaning

Importing and Cleaning

Importing

Pickle-files

import pandas as pd
x = pd.read_pickle('data.pkl')

CSV Files

# Use only certain columns
import pandas as pd
cols= ['style', 'type', 'price']
wine= pd.read_csv(file_name, usecols= cols)
print(wine.head())
# - values in the CSV are replaced by NaN
import pandas as pd
candy= pd.read_csv(file_name, na_values= '-')
print(candy.head())

TXT Files

data = "data.txt"
file = open(data, mode="r")
text = file.read()
file.close()
# import a txt file with numeric columns to a np_array
import numpy as np
np.loadtxt("data.txt", delimiter="\t", dtype="int")

MATLAB file

# transform a matlab file to python dictionnary
import scipy.io
import numpy as np
mat = scipy.io.loadmat('file.mat')
type(mat)

Excel File

import pandas as pd
import xlrd
xls = pd.read_excel(url)

 

Cleaning

# Split email in two columns
contact.email.str.split('@', expand = True)
# Fill all NaN values with 80
df.fillna(80)

# Fill all NaN values with the next value
df.fillna(method = 'ffill')
# Transform a object column to category
user_xp['status'] = user_xp['status'].astype('category')
user_xp.info()

 

Pivoting a dataframe from wide to long format

# rows = store_id , column = type of store
store_wide= store_info.pivot_table(index= 'store_id', columns= 'type', values= 'sales')
print(store_wide)
# given a Name Height Weight table --> transform it to Name Variable value table
import pandas as pd
pd.melt(measure, id_vars = 'Name', value_vars = 'Height')
# given a table with store-2018-2019-2020
# transform it to store- year - sales
store_long= store_sales.melt(id_vars= 'store', var_name= 'year', value_name= 'sales')
print(store_long.head())