[๋น ๋ฐ์ดํฐ ๋ถ์๊ธฐ์ฌ] ์ค๊ธฐ 5ํ - 2์ ํ x_train๊ณผ x_test ๊ฐ์๊ฐ ๋ค๋ฅผ๋ reindex ์ฌ์ฉ
import pandas as pd
# Load the training and test sets from the remote CSV files.
train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e5_p2_train_.csv')
test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e5_p2_test_.csv')

# Peek at the first two rows of each frame (display() is provided by IPython/Jupyter).
display(train.head(2))
test.head(2)

# Separate the target column ('price') from the training features.
x_train = train.drop(columns=['price'])
y_train = train['price']
x_test = test

# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line after each report.
x_train.info()
y_train.info()
x_test.info()
๋ฐ์ดํฐ ๋ถ๋ฌ์จ ๋ค์์ ๋๋ ์คฌ๋ค info๋ก ๋ฐ์ดํฐ ํ์ ํ์ธํ์ > ์ํซ ์ธ์ฝ๋ฉ ํ์
# Data size / summary statistics: print describe() for the test features,
# the training features and the training target, in that order.
for _frame in (x_test, x_train, y_train):
    print(_frame.describe())
데이터 이상치 확인해보려고 기초통계량 함수 써서 min, max 비교해봤는데 이상치는 딱히 없었다
# Report missing-value counts for the training features, the test features
# and the training target, in that order.
for _frame in (x_train, x_test, y_train):
    print(_frame.isna().sum())
결측치도 없었다
# Keep a copy of the test IDs before removing the identifier column,
# then drop 'ID' from both feature sets.
ID = x_test['ID'].copy()
x_train = x_train.drop('ID', axis=1)
x_test = x_test.drop('ID', axis=1)
ID๋ ํ์ ์๋ ๋ณ์๋๊น ์ ๊ฑฐํด์คฌ๊ณ
# One-hot encode each feature set independently; the resulting column sets
# are not guaranteed to match between the two frames.
x_train, x_test = (pd.get_dummies(features) for features in (x_train, x_test))
๊ทผ๋ฐ ๋ฌธ์ ๊ฐ ์๊ฒผ๋ค
์ฌ์ง ์ค๋ช ์ ์ ๋ ฅํ์ธ์.
์ํซ ์ธ์ฝ๋ฉ ํด์ฃผ๊ณ ๋๋ x_train๊ณผ x_test ์นผ๋ผ ๊ฐ์๋, ์์๋ ๋ค๋ฆ... train ๋ฐ์ดํฐ์๋ test์ ์๋ ๋ฐ์ดํฐ๋ค์ด ์์์
์ด๊ฑธ ์ด์ฐํ๋ ๊ณ ๋ฏผํ๋ค๊ฐ
# Align the test columns to the training columns: same labels in the same
# order, with any column missing from the test frame created and filled with 0.
x_test = x_test.reindex(x_train.columns, axis='columns', fill_value=0)
많은 칼럼을 기준으로 적은 칼럼을 reindex 해줬다
그러고 나서 칼럼 개수도, 순서도 동일하게 된것을 확인함
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=2024,
)

# Show the shape of every piece of the split. The original printed
# x_val.shape twice and never printed y_val.shape.
print(x_train.shape)
print(x_val.shape)
print(y_train.shape)
print(y_val.shape)
이후로는 x_train, y_train 분할 해주고 (회귀 인거라서 stratify 필요없음)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Fit a random forest regressor on the training split with a fixed seed
# (fit() returns the estimator itself).
model = RandomForestRegressor(random_state=2024).fit(x_train, y_train)

# Score the validation split: RMSE is the square root of the MSE.
y_pred = model.predict(x_val)
mse = mean_squared_error(y_val, y_pred)
rmse = pow(mse, 0.5)
print(rmse)

# Predict on the test features and write the ID/prediction pairs to 'datafox'.
y_result = model.predict(x_test)
result = pd.DataFrame({'ID': ID}).assign(Target=y_result)
result.to_csv('datafox', index=False)

# Read the file back to check what was written.
pd.read_csv("datafox")
정답까지 맞게 나옴!
5회의 핵심
# Key step: reindex the test frame onto the training columns so both frames
# share the same columns in the same order; missing columns are filled with 0.
x_test = x_test.reindex(x_train.columns, axis='columns', fill_value=0)