import pandas as pd
# Load the train and test CSV files from the Datamanim data repository.
train = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e5_p2_train_.csv')
test = pd.read_csv('https://raw.githubusercontent.com/Datamanim/datarepo/main/krdatacertificate/e5_p2_test_.csv')

# Preview the first rows of both frames. A bare expression in the middle of a
# cell is not rendered, so both previews go through display() explicitly.
# NOTE(review): display is not imported here - presumably the IPython/Jupyter
# built-in; confirm before running this as a plain script.
display(train.head(2))
display(test.head(2))

# Separate the target column ('price') from the training features.
x_train = train.drop(columns=['price'])
y_train = train['price']
x_test = test

# info() writes its summary directly to stdout and returns None, so wrapping
# it in print() only appended a stray "None" line after each summary.
x_train.info()
y_train.info()
x_test.info()
๋ฐ์ดํฐ ๋ถ๋ฌ์จ ๋ค์์ ๋๋ ์คฌ๋ค info๋ก ๋ฐ์ดํฐ ํ์ ํ์ธํ์ > ์ํซ ์ธ์ฝ๋ฉ ํ์
# Print descriptive statistics for the test features, the training features
# and the target (in that order) so their value ranges can be compared.
for frame in (x_test, x_train, y_train):
    print(frame.describe())
데이터 이상치 확인해보려고 기초통계량 함수 써서 min, max 비교해봤는데 이상치는 딱히 없었다
# Print the number of missing values per column for the training features,
# the test features and the target, in that order.
for frame in (x_train, x_test, y_train):
    print(frame.isnull().sum())
결측치는 없었다
# Keep a copy of the test IDs (used later for the submission file), then
# remove the ID column from both feature frames.
ID = x_test['ID'].copy()
x_train, x_test = x_train.drop(columns=['ID']), x_test.drop(columns=['ID'])
ID๋ ํ์ ์๋ ๋ณ์๋๊น ์ ๊ฑฐํด์คฌ๊ณ
# One-hot encode each frame on its own. Because the two frames are encoded
# independently, the resulting dummy columns are not guaranteed to match.
x_train, x_test = map(pd.get_dummies, (x_train, x_test))
๊ทผ๋ฐ ๋ฌธ์ ๊ฐ ์๊ฒผ๋ค
์ฌ์ง ์ค๋ช ์ ์ ๋ ฅํ์ธ์.
์ํซ ์ธ์ฝ๋ฉ ํด์ฃผ๊ณ ๋๋ x_train๊ณผ x_test ์นผ๋ผ ๊ฐ์๋, ์์๋ ๋ค๋ฆ... train ๋ฐ์ดํฐ์๋ test์ ์๋ ๋ฐ์ดํฐ๋ค์ด ์์์
์ด๊ฑธ ์ด์ฐํ๋ ๊ณ ๋ฏผํ๋ค๊ฐ
# Align the test frame to the training columns (same set, same order).
# Columns present only in the training frame are created and filled with 0.
x_test = x_test.reindex(x_train.columns, axis="columns", fill_value=0)
많은 칼럼을 기준으로 적은 칼럼을 reindex 해줬다
그러고 나서 칼럼 개수도, 순서도 동일하게 된것을 확인함
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split reproducible between runs.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=2024,
)

# Sanity-check the shape of all four pieces. Previously x_val was printed
# twice and y_val was never shown.
print(x_train.shape)
print(x_val.shape)
print(y_train.shape)
print(y_val.shape)
이후로는 x_train, y_train 분할 해주고 (회귀 인거라서 stratify 필요없음)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Train a random forest on the training split. fit() returns the estimator
# itself, so construction and fitting are chained.
model = RandomForestRegressor(random_state=2024).fit(x_train, y_train)

# Evaluate on the validation split: RMSE is the square root of the MSE.
y_pred = model.predict(x_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse = mse ** 0.5
print(rmse)

# Predict the test set, write the submission (test ID + predicted value)
# without the index, then read the file back to inspect what was saved.
y_result = model.predict(x_test)
result = pd.DataFrame(data={'ID': ID, 'Target': y_result})
result.to_csv('datafox', index=False)
pd.read_csv('datafox')
정답까지 맞게 나옴!
5회의 핵심
# Key takeaway: align the one-hot encoded test columns to the training
# columns; dummy columns missing from the test frame are filled with 0.
x_test = x_test.reindex(x_train.columns, axis="columns", fill_value=0)
'자격증, 어학' 카테고리의 다른 글
[빅데이터 분석기사] 실기 3회- 2유형 prob (0) | 2024.08.20 |
---|---|
[빅데이터 분석기사] 실기 4회 - 2유형 결측치 대체,drop (0) | 2024.08.20 |
[빅데이터 분석기사] 실기 6회 - 2유형 macro (0) | 2024.08.20 |
[빅데이터 분석기사] 실기 7회 - 2유형 RandomForestRegressor (0) | 2024.08.20 |
[빅데이터 분석기사] 실기 2회 - 1유형 이상치 (1) | 2024.08.19 |