Importing and Cleaning
Importing
Pickle-files
1 2 |
import pandas as pd x = pd.read_pickle('data.pkl') |
CSV Files
1 2 3 4 5 |
# Use only certain columns import pandas as pd cols= ['style', 'type', 'price'] wine= pd.read_csv(file_name, usecols= cols) print(wine.head()) |
1 2 3 4 |
# - values in the CSV are replaced by NaN import pandas as pd candy= pd.read_csv(file_name, na_values= '-') print(candy.head()) |
TXT Files
1 2 3 4 |
data = "data.txt" file = open(data, mode="r") text = file.read() file.close() |
1 2 3 |
# import a txt file with numeric columns to a np_array import numpy as np np.loadtxt("data.txt", delimiter="\t", dtype="int") |
MATLAB file
1 2 3 4 5 |
# transform a matlab file to python dictionnary import scipy.io import numpy as np mat = scipy.io.loadmat('file.mat') type(mat) |
Excel File
1 2 3 |
import pandas as pd import xlrd xls = pd.read_excel(url) |
Cleaning
1 2 |
# Split email in two columns contact.email.str.split('@', expand = True) |
1 2 3 4 5 |
# Fill all NaN values with 80 df.fillna(80) # Fill all NaN values with the next value df.fillna(method = 'ffill') |
1 2 3 |
# Transform a object column to category user_xp['status'] = user_xp['status'].astype('category') user_xp.info() |
Pivoting a dataframe from wide to long format
1 2 3 |
# rows = store_id , column = type of store store_wide= store_info.pivot_table(index= 'store_id', columns= 'type', values= 'sales') print(store_wide) |
1 2 3 |
# given a Name Height Weight table --> transform it to Name Variable value table import pandas as pd pd.melt(measure, id_vars = 'Name', value_vars = 'Height') |
1 2 3 4 |
# given a table with store-2018-2019-2020 # transform it to store- year - sales store_long= store_sales.melt(id_vars= 'store', var_name= 'year', value_name= 'sales') print(store_long.head()) |