# Split the raw frames: every column except the target becomes the training
# feature matrix, and the 'Segmentation' column becomes the training label.
x_train = train.drop(columns=['Segmentation'])
x_test = test
y_train = train['Segmentation']

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
x_train.info()
x_test.info()
y_train.info()

사진 설명을 입력하세요.
결측치가 엄청 많다 드디어 나왔구나 결측치 있는 데이터가!
# Summary statistics for the training features, the training labels and the
# test features, printed in that order.
for dataset in (x_train, y_train, x_test):
    print(dataset.describe())

사진 설명을 입력하세요.
이상치가 있는지 확인해봤으나 없었음
# Per-column count of missing values, first for the training features and
# then for the test features.
for features in (x_train, x_test):
    print(features.isna().sum())
사진 설명을 입력하세요.
하지만 결측치가 엄청 많고...
# Columns with missing values
#   x_train: Ever_Married, Graduated, Profession, Work_Experience (numeric),
#            Family_Size (numeric), Var_1
#   x_test : Ever_Married, Graduated, Profession, Work_Experience,
#            Family_Size, Var_1

# Work_Experience is discarded from both frames instead of being imputed.
x_train = x_train.drop(columns=['Work_Experience'])
x_test = x_test.drop(columns=['Work_Experience'])

# Fill statistics are computed on the training frame only: the median for the
# numeric column, the mode for each categorical column.
family_median = x_train['Family_Size'].median()
ever_mode = x_train['Ever_Married'].mode()
graduated_mode = x_train['Graduated'].mode()
var_mode = x_train['Var_1'].mode()
profession_mode = x_train['Profession'].mode()

# mode() returns a Series, so its first entry [0] is used as the fill value.
fill_values = {
    'Family_Size': family_median,
    'Ever_Married': ever_mode[0],
    'Graduated': graduated_mode[0],
    'Var_1': var_mode[0],
    'Profession': profession_mode[0],
}

# Apply the same training-derived value to the training and test frames.
for column, fill_value in fill_values.items():
    x_train[column] = x_train[column].fillna(fill_value)
    x_test[column] = x_test[column].fillna(fill_value)
** 주의사항 : train 데이터의 중앙값으로 test 데이터도 변경해줘야 함 **
# 연속형 변수 : 중앙값, 평균값
# df['변수명'].median()
# df['변수명'].mean()
# 범주형 변수 : 최빈값
# df['변수명'] = df['변수명'].fillna(대체할 값)
## 중앙값 대체 예시 ##
# Median-imputation example: the median is taken from the training frame and
# the same value fills the gaps in both the training and the test frame.
med_age = x_train['age'].median()
for frame in (x_train, x_test):
    frame['age'] = frame['age'].fillna(med_age)
**주의사항: train 데이터의 최빈값은 [0]를 가져와야 함 **
# Mode-imputation example: mode() returns a Series, so the fill value is its
# first entry [0]. The mode is computed on the training frame.
ever_mode = x_train['Ever_Married'].mode()
most_frequent = ever_mode[0]
x_test['Ever_Married'] = x_test['Ever_Married'].fillna(most_frequent)
์๋ฌดํผ ๊ฒฐ์ธก์น ๋ค ์ฑ์์ฃผ๊ณ , ๋ฒ๋ฆด ์นผ๋ผ์ ๋ฒ๋ ธ์, work_experience๋ฅผ ๋์ฒดํด๋ ๋์ง๋ง ๊ฐ์ด ์ปค์ ๊ทธ๋ฅ ๋ฒ๋ ธ๋ค
์ด์์น๋ ์์ผ๋ ํจ์คํ๊ณ ๋ณ์ ์ ๊ฑฐ, ์ํซ ์ธ์ฝ๋ฉ ํด์ค์ผํจ (์ธ์ฝ๋ฉ ์ฌ์ค ๊น๋จน์ด์ ์ค๋ฅ๋์ ๋ค์ ์ฌ๋ผ๊ฐ์ ํจ ใ ใ ..)
# Keep an independent copy of the test IDs, then remove the identifier column
# from both feature frames.
ID = x_test['ID'].copy()
x_train = x_train.drop('ID', axis=1)
x_test = x_test.drop('ID', axis=1)
ID 칼럼은 필요없으니 드랍해주자
df = df.drop(columns = ['변수1','변수2'])
df = df.drop(['변수1','변수2'], axis=1)
<필요없는 칼럼 제거하는 방법은 위와 같다>
# One-hot encode the categorical columns of each feature frame.
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)

# The two frames are encoded independently, so a category that appears in only
# one of them would produce different columns. Align the test frame to the
# training layout (absent dummies are added as 0, extra ones are dropped);
# the data is unchanged when the columns already match.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

# info() prints its own report and returns None, so no print() wrapper is
# used (it would only add a stray "None" line).
x_train.info()
x_test.info()
์ํซ ์ธ์ฝ๋ฉ ํด๋ณด๊ณ x_train, x_test ์นผ๋ผ ๊ฐ์์ ์์ ๋น๊ตํ๋๋ฐ ์ผ์นํด์ reindex๊ฐ ํ์ ์์
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation. stratify=y_train keeps the
# class proportions the same in both parts; random_state fixes the shuffle.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=2024,
)

# Shapes of the four resulting pieces, in the order they were returned.
for part in (x_train, x_val, y_train, y_val):
    print(part.shape)
분류분석 이니까 stratify 꼭 써주자!
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Fit a random forest with a fixed seed (fit() returns the estimator itself),
# then predict the held-out validation rows.
model = RandomForestClassifier(random_state=2024).fit(x_train, y_train)
y_pred = model.predict(x_val)

# Macro-averaged F1 of the validation predictions.
f1 = f1_score(y_val, y_pred, average='macro')
print(f1)
**주의사항:y값이 다중분류니까 macro 잊지말고 쓰기**
# Predict a segment for every test row and pair it with the saved test IDs.
y_result = model.predict(x_test)
result = pd.DataFrame({'ID': ID}).assign(Segmentation=y_result)

# Write the result without the index column, then read the file back.
result.to_csv('datafox', index=False)
pd.read_csv("datafox")
끝!