作業1_1「資料前處理」.ipynb

作業1_1「資料前處理」.html

Google Colab

設計說明

請依序回答下列問題

請參考範例2-4鐵達尼號生存預測.ipynb作答

2_4鐵達尼號生存預測_102.html

結論

# 導入需要套件
import sklearn
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model

# Load the Titanic dataset from the CSV file and preview the first rows.
titanic = pd.read_csv("/content/drive/MyDrive/titanic.csv")
print(titanic.head())
print()

# Report, column by column, whether any value is missing.
print(titanic.isnull().any())
print()

# Fill missing ages with the median age. The median is computed once and
# reused so the printed value is exactly the value used for imputation.
median = titanic['Age'].median()
print('中位數: ', median)
print()
titanic['Age'] = titanic['Age'].fillna(median)

# Repeat the missing-value report to confirm 'Age' no longer has gaps.
print(titanic.isnull().any())
print()

# Encode the categorical 'PClass' column as integer codes so the model can
# consume it. LabelEncoder expects a 1-D input, so the column is passed as a
# Series rather than as an (n, 1) array, which would raise a
# DataConversionWarning.
y = titanic.loc[:, 'Survived']
le = LabelEncoder()
X_temp = le.fit_transform(titanic['PClass'])
print(le.classes_)
print()

# Assemble the feature table: encoded 'PClass' next to 'Age'.
# The encoded frame reuses titanic's index so the column-wise concat aligns
# row-for-row even if titanic does not carry a default RangeIndex.
X_temp = pd.DataFrame(X_temp, columns=['PClass'], index=titanic.index)
X = pd.concat([X_temp, titanic['Age']], axis=1)
print(X.head())
print()

# Build a logistic-regression classifier and fit it on the features X
# against the target y (fit returns the estimator itself).
logistic = LogisticRegression().fit(X, y)

# Evaluate on the same rows the model was trained on: cross-tabulate the
# predictions against the true labels (confusion matrix), then print the
# mean accuracy. NOTE: both figures are in-sample, not held-out, metrics.
print('Confusion Matrix')
preds = logistic.predict(X)
confusion = pd.crosstab(preds, y)
print(confusion)
accuracy = logistic.score(X, y)
print(accuracy)