Data/Dacon
집 값 예측 분석...1
Kirok Kim
2022. 2. 3. 20:34
명목형 데이터 변환 및 히트맵의 자세한 내용은 3장에서 다룰 예정
μ΅μ(μ°κ΅¬μ€)
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
# 1. train.csv : training data
# id : unique row id
# OverallQual : overall material and finish quality
# YearBuilt : year construction was completed
# YearRemodAdd : remodeling year
# ExterQual : exterior material quality
# BsmtQual : basement height
# TotalBsmtSF : basement area
# 1stFlrSF : first-floor area
# GrLivArea : above-ground living area
# FullBath : number of above-ground bathrooms
# KitchenQual : kitchen quality
# GarageYrBlt : year the garage was completed
# GarageCars: number of car spaces in the garage
# GarageArea: garage area
# target : house price (in dollars)
# Read the training CSV (the path looks like a Colab Google Drive mount —
# adjust it when running elsewhere) and drop the 'id' column.
train_csv_path = '/content/drive/MyDrive/μ§κ°μμΈ‘λΆμ/train.csv'
data = pd.read_csv(train_csv_path)
data = data.drop(columns='id')

# Bare expression: shows the frame when it is the last statement of a notebook cell.
data
# After loading the data, check it for missing values.
def check(data):
    """Report which columns of ``data`` contain missing values.

    For every column with at least one missing entry, prints the column name
    and the number of missing entries. Prints ``'x'`` when no column has any
    missing value.

    Args:
        data: pandas DataFrame to inspect.

    Returns:
        A list of ``[column_name, dtype]`` pairs, one per column that has
        missing values, in column order. Empty when nothing is missing.
    """
    mcol = []
    for col in data.columns:
        # Vectorized count of missing entries (instead of Python-level sum()).
        mv = data[col].isna().sum()
        if mv >= 1:
            print(f'κ²°μΈ‘ {col}')
            print(f'{mv} κ°')
            mcol.append([col, data[col].dtype])
    if not mcol:
        # Marker printed when no column has missing values.
        print('x')
    return mcol
# Collect [column, dtype] pairs for the columns that contain missing values;
# check() also prints a per-column report as a side effect.
mcol = check(data)
# Summary statistics of the frame (pandas covers numeric columns by default).
data.describe()
# Print column names, non-null counts and dtypes.
data.info()
# First, convert the text-typed data to numbers so that correlation coefficients can be computed.
from sklearn.preprocessing import LabelEncoder

# Encode on a copy so the raw frame `data` is left untouched.
corr_df = data.copy()

# Object-dtype columns are the ones that receive integer codes.
is_text = corr_df.dtypes == 'O'
text_cols = corr_df.columns[is_text]

# astype(str) turns every value into a string first; the encoder is then
# refit on each column in turn.
# NOTE(review): LabelEncoder assigns codes in sorted label order, which need
# not match any meaningful ranking of the labels — correlations computed from
# these codes should be read with care.
encoder = LabelEncoder()
corr_df[text_cols] = corr_df[text_cols].astype(str).apply(encoder.fit_transform)

# Bare expression: shows one encoded column when it ends a notebook cell.
corr_df['Exter Qual']
## Looking at the correlations, many features appear inversely related.
# I think this is because the numeric encoding done through sklearn did not work properly.
# Even so, the correlation plot may well be the most useful visualization for analyzing the data.
# Draw the pairwise correlations of corr_df as an annotated heatmap.
plt.figure(figsize=(15, 10))
heat_table = corr_df.corr()

# 1.0 on and above the diagonal, 0.0 below it; cells where the mask is truthy
# are not drawn, so only the strictly-lower triangle is shown.
mask = np.triu(np.ones_like(heat_table))

heatmap_ax = sns.heatmap(heat_table, mask=mask, annot=True, cmap='coolwarm')

# Tilt the x tick labels and set the font size; the rotation angle is
# measured counter-clockwise.
heatmap_ax.set_xticklabels(
    heatmap_ax.get_xticklabels(),
    rotation=45,
    fontsize=15,
)
heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), fontsize=15)

plt.title('correlation between features', fontsize=40)
plt.show()
# NOTE: my overall understanding of sns.heatmap is still lacking; the numeric values need a closer look...
λ°μν