Importing and Cleaning
Importing
Pickle-files
import pandas as pd x = pd.read_pickle('data.pkl')
CSV Files
# Use only certain columns import pandas as pd cols= ['style', 'type', 'price'] wine= pd.read_csv(file_name, usecols= cols) print(wine.head())
# - values in the CSV are replaced by NaN import pandas as pd candy= pd.read_csv(file_name, na_values= '-') print(candy.head())
TXT Files
data = "data.txt" file = open(data, mode="r") text = file.read() file.close()
# import a txt file with numeric columns to a np_array import numpy as np np.loadtxt("data.txt", delimiter="\t", dtype="int")
MATLAB file
# transform a matlab file to python dictionnary import scipy.io import numpy as np mat = scipy.io.loadmat('file.mat') type(mat)
Excel File
import pandas as pd import xlrd xls = pd.read_excel(url)
Cleaning
# Split email in two columns contact.email.str.split('@', expand = True)
# Fill all NaN values with 80 df.fillna(80) # Fill all NaN values with the next value df.fillna(method = 'ffill')
# Transform a object column to category user_xp['status'] = user_xp['status'].astype('category') user_xp.info()
Pivoting a dataframe from wide to long format
# rows = store_id , column = type of store store_wide= store_info.pivot_table(index= 'store_id', columns= 'type', values= 'sales') print(store_wide)
# given a Name Height Weight table --> transform it to Name Variable value table import pandas as pd pd.melt(measure, id_vars = 'Name', value_vars = 'Height')
# given a table with store-2018-2019-2020 # transform it to store- year - sales store_long= store_sales.melt(id_vars= 'store', var_name= 'year', value_name= 'sales') print(store_long.head())