K近鄰思想: 根據你的"鄰居們"來確定你的類別
你一覺醒來,不知道自己身在何方,你能通過計算機定位到周圍5個"最近的"鄰居,其中有4個身處火星,1個身處月球,你認為自己應該距火星更近,所以自己應該在火星...(K近鄰算法又稱為KNN算法,屬於分類算法)
案例1
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
def knncls():
    """Classify movies with k-nearest neighbours and print the results.

    Reads ``./data/movies.csv``, uses every column except ``type`` and
    ``movie_name`` as features and ``type`` as the label, holds out 25% of
    the rows as a test set, fits a default ``KNeighborsClassifier`` and
    prints the test-set predictions followed by the classifier's score.

    :return: None
    """
    movies = pd.read_csv("./data/movies.csv")

    # Features are whatever remains once the label and the title are removed.
    features = movies.drop(columns=["type", "movie_name"])
    labels = movies["type"]

    # Hold out a quarter of the rows for evaluation.
    split = train_test_split(features, labels, test_size=0.25)
    x_train, x_test, y_train, y_test = split

    # Fit a default-configured KNN estimator and predict the held-out rows.
    classifier = KNeighborsClassifier().fit(x_train, y_train)
    predicted = classifier.predict(x_test)

    print(x_test, "的預測結果為:", predicted)
    score = classifier.score(x_test, y_test)
    print("預測準确率為:", score)
# Script entry point: run the movie demo only when executed directly.
if __name__ == "__main__":
    knncls()
準確率
movie_name,fight,kiss,type
California Man,3,104,1
He's not Really into dues,2,100,1
Beautiful Woman,1,81,1
Kevin Longblade,101,10,2
Robo Slayer 3000,99,5,2
Amped II,98,2,2
unname,18,90,1
vampire,90,15,2
facebook預測入住地點
train_data
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
def knncls():
    """Predict Facebook check-in places with k-nearest neighbours.

    Loads the first 100000 rows of ``./data/fb/train.csv``, restricts them
    to a small x/y window, derives calendar features from the ``time``
    column, drops rarely seen places, standardises the features and prints
    the score of a default ``KNeighborsClassifier`` on a 25% hold-out split.

    :return: None
    """
    # Only the first 100000 rows are read.
    train_data = pd.read_csv("./data/fb/train.csv", nrows=100000)

    # Feature engineering
    # 1. Narrow the x/y range. The explicit copy makes the column
    #    assignments below operate on an independent frame instead of a
    #    slice of the frame that was read (avoids SettingWithCopyWarning).
    train_data = train_data.query("x>1.0 & x<1.5 & y>1.0 & y<2.5").copy()

    # 2. Parse the timestamp column, interpreting the values as seconds.
    time_value = pd.to_datetime(train_data["time"], unit="s")

    # 3. Add time-derived features. The day-of-month column was previously
    #    stored under the misleading name "year"; the values are unchanged.
    train_data["weekday"] = time_value.dt.weekday
    train_data["day"] = time_value.dt.day
    train_data["hour"] = time_value.dt.hour
    train_data["minute"] = time_value.dt.minute

    # 4. Drop the raw timestamp now that its features are extracted.
    train_data = train_data.drop(columns=["time"])

    # 5. Keep only places that occur in more than 3 rows.
    # NOTE(review): the original comment said "more than 5" while the code
    # filtered on > 3; the code's threshold is kept -- confirm which is meant.
    place_count = train_data.groupby("place_id").count()
    place_count_r = place_count[place_count.row_id > 3].reset_index()
    keep = train_data["place_id"].isin(place_count_r["place_id"])
    train_data = train_data[keep]

    # Split into features and target; row_id is an identifier, not a feature.
    x = train_data.drop(columns=["place_id", "row_id"])
    y = train_data["place_id"]

    # Hold out 25% of the rows for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Standardise: fit the scaler on the training split only, then apply the
    # same transformation to the test split.
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)

    # Fit a default-configured KNN estimator.
    knn = KNeighborsClassifier()
    knn.fit(x_train, y_train)

    # Report the score on the held-out split.
    print("準确率為:", knn.score(x_test, y_test))
# Script entry point: run the check-in demo only when executed directly.
if __name__ == "__main__":
    knncls()