


这里使用的测试数据共包含40位人员的照片,每个人10张照片。也可登录 http://www.cl.cam.ac.uk/research/dtg/attarchive/facesataglance.html 查看这400张照片的缩略图。

import time
import logging
from sklearn.datasets import fetch_olivetti_faces

# Emit INFO-level log records prefixed with a timestamp.
# Note: the level constant is upper-case (logging.INFO).
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")

# Directory handed to fetch_olivetti_faces as its data_home.
data_home = 'code/datasets/'
faces = fetch_olivetti_faces(data_home=data_home)



import numpy as np

# Flattened image vectors and the person id of every sample.
X = faces.data
y = faces.target

# One display label ("p0", "p1", ...) per distinct person id.
targets = np.unique(faces.target)
target_names = np.array(["p%d" % t for t in targets])
# Fixed: the original read the undefined name `target_name` here.
n_targets = target_names.shape[0]

# faces.images is unpacked as (sample count, image height, image width).
n_samples, h, w = faces.images.shape
print('Samples count:{}\nTarget count:{}'.format(n_samples, n_targets))
print('Image size:{}x{}\nData shape:{}'.format(w, h, X.shape))
Samples count:400
Target count:40
Image size:64x64
Data shape:(400, 4096)




%matplotlib inline
from matplotlib import pyplot as plt
def plot_gallery(images, titles, h, w, n_row=2, n_col=5):
    """Draw a grid of grayscale images with a title above each one.

    Args:
        images: Indexable collection of image arrays; each entry is reshaped
            to ``(h, w)`` before being drawn.
        titles: Indexable collection of titles, parallel to ``images``.
        h: Image height used for the reshape.
        w: Image width used for the reshape.
        n_row: Number of grid rows.
        n_col: Number of grid columns.

    At most ``n_row * n_col`` cells are filled; if fewer images or titles are
    supplied, the remaining cells are left empty instead of raising
    ``IndexError``.
    """
    plt.figure(figsize=(2 * n_col, 2.2 * n_row), dpi=140)
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.01)
    n_cells = min(n_row * n_col, len(images), len(titles))
    for i in range(n_cells):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i])
        plt.axis('off')
n_row = 2
n_col = 6

# Pick one random photo per person. X[y == i] selects every photo of person i
# (note the comparison uses the loop index i); one row is drawn at random from
# that subset and kept as a (1, n_features) array.
picked_rows = []
sample_titles = []
for i in range(n_targets):
    people_images = X[y == i]
    people_sample_index = np.random.randint(0, people_images.shape[0], 1)
    people_sample_image = people_images[people_sample_index, :]
    picked_rows.append(people_sample_image)
    # target_names holds the per-person labels generated earlier.
    sample_titles.append(target_names[i])

# Stack the picked rows once instead of re-concatenating on every iteration.
sample_images = np.concatenate(picked_rows, axis=0)

# Only the first n_row * n_col samples fit into the gallery grid.
plot_gallery(sample_images, sample_titles, h, w, n_row, n_col)

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; random_state pins the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4)



# NOTE: from here on the name `time` refers to the function time.time,
# which is what the timing code below calls.
from time import time
from sklearn.svm import SVC

# Baseline: fit an SVC directly on the raw pixel vectors and time it.
t = time()
clf = SVC(class_weight='balanced')
clf.fit(X_train, y_train)
print("耗时:{}秒".format(time() - t))


from sklearn.metrics import confusion_matrix

# Evaluate the baseline classifier on the held-out samples.
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels=range(n_targets))
# np.set_printoptions(threshold=np.nan)
# NOTE(review): the disabled line above was meant to print the matrix without
# truncation; recent NumPy releases appear to reject a NaN threshold
# (sys.maxsize is the usual replacement) - confirm before re-enabling.
print("confusion_matrix:")
print(cm)
confusion_matrix:[[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0][0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0][0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0][0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]



但是很明显输出结果效果很差。因为confusion_matrix理想的输出是矩阵的对角线上有数字,其他地方都为0,而这里很多图片都被预测成了第12个类别(索引为11)。我们再来看下classification_report的结果:

from sklearn.metrics import classification_report

# Argument order matters: true labels first, predictions second.
# labels=targets pins the report rows to every person id, so the 40 entries
# of target_names stay aligned even when some ids are absent from the split.
print(classification_report(y_test, y_pred, labels=targets,
                            target_names=target_names))
             precision    recall  f1-score   supportp0       0.00      0.00      0.00         1p1       0.00      0.00      0.00         3p2       0.00      0.00      0.00         2p3       0.00      0.00      0.00         1p4       0.00      0.00      0.00         1p5       0.00      0.00      0.00         1p6       0.00      0.00      0.00         4p7       0.00      0.00      0.00         2p8       0.00      0.00      0.00         4p9       0.00      0.00      0.00         2p10       0.00      0.00      0.00         1p11       0.00      0.00      0.00         0p12       0.00      0.00      0.00         4p13       0.00      0.00      0.00         4p14       0.00      0.00      0.00         1p15       0.00      0.00      0.00         1p16       0.00      0.00      0.00         3p17       0.00      0.00      0.00         2p18       0.00      0.00      0.00         2p19       0.00      0.00      0.00         2p20       0.00      0.00      0.00         1p21       0.00      0.00      0.00         2p22       0.00      0.00      0.00         3p23       0.00      0.00      0.00         2p24       0.00      0.00      0.00         3p25       0.00      0.00      0.00         3p26       0.00      0.00      0.00         2p27       0.00      0.00      0.00         2p28       0.00      0.00      0.00         0p29       0.00      0.00      0.00         2p30       0.00      0.00      0.00         2p31       0.00      0.00      0.00         3p32       0.00      0.00      0.00         2p33       0.00      0.00      0.00         2p34       0.00      0.00      0.00         0p35       0.00      0.00      0.00         2p36       0.00      0.00      0.00         3p37       0.00      0.00      0.00         1p38       0.00      0.00      0.00         2p39       0.00      0.00      0.00         2avg / total       0.00      0.00      0.00        80/Users/hadoop/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are 
ill-defined and being set to 0.0 in labels with no predicted samples.'precision', 'predicted', average, warn_for)
/Users/hadoop/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1137: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.'recall', 'true', average, warn_for)









from sklearn.decomposition import PCA

# For each candidate component count, fit PCA on the full data set and record
# the total fraction of variance that the retained components explain.
print("Exploring explained variance ratio for dataset ...")
candidate_components = range(10, 300, 30)
explained_ratios = []
t = time()
for c in candidate_components:
    pca = PCA(n_components=c)
    # Only the fitted model is needed here; the projected data is not used.
    pca.fit(X)
    explained_ratios.append(np.sum(pca.explained_variance_ratio_))
print('Done in {0:.2f}s'.format(time()-t))
Exploring explained variance ratio for dataset ...
Done in 2.17s
# Chart the explained variance ratio against the number of PCA components.
plt.figure(dpi=100, figsize=(8, 5))
plt.title('Explained variance ratio for PCA')
plt.xlabel('Number of PCA Components')
plt.ylabel('Explained Variance Ratio')
plt.plot(candidate_components, explained_ratios)
# Fixed tick grids; the trailing semicolon suppresses the notebook's echo of
# the last expression's return value.
plt.xticks(np.arange(0, 300, 20))
plt.yticks(np.arange(0.5, 1.05, .05));


def title_prefix(prefix, title):
    """Return ``title`` tagged with ``prefix``, formatted as "prefix: title"."""
    return "{}: {}".format(prefix, title)


n_row = 1
n_col = 5

# Keep the first five sample faces and show them unmodified in the top row.
sample_images = sample_images[0:5]
sample_titles = sample_titles[0:5]

plotting_images = sample_images
plotting_titles = [title_prefix('orig', title) for title in sample_titles]

# For each component count: fit PCA on the full data set, project the sample
# faces, map them back to pixel space, and append the reconstructions as one
# more gallery row.
candidate_components = [120, 75, 37, 19, 8]
for c in candidate_components:
    print("Fitting and projecting on PCA(n_components={}) ...".format(c))
    t = time()
    pca = PCA(n_components=c)
    pca.fit(X)
    X_sample_pca = pca.transform(sample_images)
    X_sample_inv = pca.inverse_transform(X_sample_pca)
    plotting_images = np.concatenate((plotting_images, X_sample_inv), axis=0)
    sample_title_pca = [title_prefix('{}'.format(c), title)
                        for title in sample_titles]
    plotting_titles = np.concatenate((plotting_titles, sample_title_pca), axis=0)
    print("Done in {0:.2f}s".format(time() - t))

print("Plotting sample image with different number of PCA conpoments ...")
# One row for the originals plus one row per candidate component count.
plot_gallery(plotting_images, plotting_titles, h, w,
             n_row * (len(candidate_components) + 1), n_col)
Fitting and projecting on PCA(n_components=120) ...
Done in 0.18s
Fitting and projecting on PCA(n_components=75) ...
Done in 0.14s
Fitting and projecting on PCA(n_components=37) ...
Done in 0.11s
Fitting and projecting on PCA(n_components=19) ...
Done in 0.07s
Fitting and projecting on PCA(n_components=8) ...
Done in 0.06s
Plotting sample image with different number of PCA conpoments ...



n_components = 120

# Fit the PCA model on the training split only, so the test split does not
# influence the learned projection.
print("Fitting PCA by using training data ...")
t = time()
pca = PCA(n_components=n_components, svd_solver='randomized', whiten=True).fit(X_train)
print("Done in {0:.2f}s".format(time() - t))

# Project both splits with the model fitted above.
print("Projecting input data for PCA ...")
t = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("Done in {0:.2f}s".format(time() - t))
Fitting PCA by using training data ...
Done in 0.16s
Projecting input data for PCA ...
Done in 0.01s
from sklearn.model_selection import GridSearchCV

# Search C and gamma for an RBF-kernel SVC on the PCA-projected training data.
print("Searching the best parameters for SVC ...")
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01]}
# n_jobs=4 asks for four parallel workers.
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'),
                   param_grid, verbose=2, n_jobs=4)
clf = clf.fit(X_train_pca, y_train)
print("Best parameters found by grid search:")
# Restored: the winning parameters were announced but never printed.
print(clf.best_params_)
Searching the best parameters for SVC ...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] C=1, gamma=0.0001 ...............................................
[CV] C=1, gamma=0.0001 ...............................................
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.1s
[CV] C=1, gamma=0.0005 ...............................................
[CV] ................................ C=1, gamma=0.0001, total=   0.1s
[CV] C=1, gamma=0.001 ................................................
[CV] ................................ C=1, gamma=0.0005, total=   0.1s
[CV] C=1, gamma=0.001 ................................................
[CV] ................................ C=1, gamma=0.0005, total=   0.1s
[CV] C=1, gamma=0.001 ................................................
[CV] ................................ C=1, gamma=0.0005, total=   0.1s
[CV] C=1, gamma=0.01 .................................................
[CV] ................................. C=1, gamma=0.001, total=   0.1s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ................................. C=1, gamma=0.001, total=   0.1s
[CV] C=5, gamma=0.0005 ...............................................
[CV] ................................. C=1, gamma=0.001, total=   0.1s
[CV] C=1, gamma=0.005 ................................................
[CV] .................................. C=1, gamma=0.01, total=   0.1s
[CV] C=1, gamma=0.01 .................................................
[CV] ................................ C=5, gamma=0.0001, total=   0.1s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ................................ C=5, gamma=0.0005, total=   0.1s
[CV] C=5, gamma=0.001 ................................................
[CV] ................................. C=1, gamma=0.005, total=   0.1s
[CV] C=1, gamma=0.005 ................................................
[CV] .................................. C=1, gamma=0.01, total=   0.1s
[CV] C=1, gamma=0.01 .................................................
[CV] ................................ C=5, gamma=0.0001, total=   0.1s
[CV] C=5, gamma=0.0005 ...............................................
[CV] ................................. C=5, gamma=0.001, total=   0.1s
[CV] C=5, gamma=0.001 ................................................
[CV] ................................. C=1, gamma=0.005, total=   0.1s
[CV] C=1, gamma=0.005 ................................................
[CV] ................................ C=5, gamma=0.0005, total=   0.1s
[CV] C=5, gamma=0.0005 ...............................................
[CV] .................................. C=1, gamma=0.01, total=   0.1s
[CV] C=5, gamma=0.0001 ...............................................
[CV] ................................. C=5, gamma=0.001, total=   0.1s
[CV] C=5, gamma=0.001 ................................................
[CV] ................................. C=1, gamma=0.005, total=   0.1s
[CV] ................................ C=5, gamma=0.0005, total=   0.1s
[CV] C=5, gamma=0.005 ................................................
[CV] C=5, gamma=0.01 .................................................
[CV] ................................ C=5, gamma=0.0001, total=   0.1s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ................................. C=5, gamma=0.001, total=   0.1s
[CV] C=10, gamma=0.001 ...............................................
[CV] ................................. C=5, gamma=0.005, total=   0.1s
[CV] C=5, gamma=0.005 ................................................
[CV] .................................. C=5, gamma=0.01, total=   0.1s
[CV] C=5, gamma=0.01 .................................................
[CV] ............................... C=10, gamma=0.0001, total=   0.1s
[CV] C=10, gamma=0.0005 ..............................................
[CV] ................................ C=10, gamma=0.001, total=   0.1s
[CV] C=10, gamma=0.001 ...............................................
[CV] ................................. C=5, gamma=0.005, total=   0.1s
[CV] C=5, gamma=0.005 ................................................
[CV] ............................... C=10, gamma=0.0005, total=   0.1s
[CV] C=10, gamma=0.0005 ..............................................
[CV] .................................. C=5, gamma=0.01, total=   0.1s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ................................ C=10, gamma=0.001, total=   0.1s
[CV] C=10, gamma=0.001 ...............................................
[CV] ................................. C=5, gamma=0.005, total=   0.1s
[CV] C=5, gamma=0.01 .................................................
[CV] ............................... C=10, gamma=0.0001, total=   0.1s
[CV] C=10, gamma=0.0001 ..............................................
[CV] ............................... C=10, gamma=0.0005, total=   0.1s
[CV] C=10, gamma=0.0005 ..............................................
[CV] ................................ C=10, gamma=0.001, total=   0.1s
[CV] C=10, gamma=0.005 ...............................................
[CV] .................................. C=5, gamma=0.01, total=   0.1s
[CV] C=10, gamma=0.005 ...............................................
[CV] ............................... C=10, gamma=0.0001, total=   0.1s
[CV] C=10, gamma=0.01 ................................................
[CV] ............................... C=10, gamma=0.0005, total=   0.1s
[CV] C=50, gamma=0.0005 ..............................................
[CV] ................................ C=10, gamma=0.005, total=   0.1s
[CV] C=50, gamma=0.001 ...............................................
[CV] ................................ C=10, gamma=0.005, total=   0.1s
[CV] C=10, gamma=0.005 ...............................................
[CV] ................................. C=10, gamma=0.01, total=   0.1s
[CV] C=50, gamma=0.0001 ..............................................
[CV] ............................... C=50, gamma=0.0005, total=   0.1s
[CV] C=50, gamma=0.0005 ..............................................
[CV] ................................ C=50, gamma=0.001, total=   0.1s
[CV] C=50, gamma=0.001 ...............................................
[CV] ............................... C=50, gamma=0.0001, total=   0.1s
[CV] ................................ C=10, gamma=0.005, total=   0.1s
[CV] C=10, gamma=0.01 ................................................
[CV] ............................... C=50, gamma=0.0005, total=   0.1s
[CV] C=50, gamma=0.0001 ..............................................
[CV] C=50, gamma=0.0005 ..............................................
[CV] ................................ C=50, gamma=0.001, total=   0.1s
[CV] ................................. C=10, gamma=0.01, total=   0.1s
[CV] C=10, gamma=0.01 ................................................
[CV] C=50, gamma=0.005 ...............................................
[CV] ............................... C=50, gamma=0.0001, total=   0.1s
[CV] C=50, gamma=0.0001 ..............................................
[CV] ............................... C=50, gamma=0.0005, total=   0.1s
[CV] C=50, gamma=0.001 ...............................................
[CV] ................................. C=10, gamma=0.01, total=   0.1s
[CV] ................................ C=50, gamma=0.005, total=   0.1s
[CV] C=50, gamma=0.005 ...............................................
[CV] C=50, gamma=0.005 ...............................................
[CV] ............................... C=50, gamma=0.0001, total=   0.1s
[CV] ................................ C=50, gamma=0.001, total=   0.1s
[CV] ................................ C=50, gamma=0.005, total=   0.1s
[CV] ................................ C=50, gamma=0.005, total=   0.1s
[CV] C=50, gamma=0.01 ................................................
[CV] ................................. C=50, gamma=0.01, total=   0.0s
[CV] C=50, gamma=0.01 ................................................
[CV] ................................. C=50, gamma=0.01, total=   0.0s
[CV] C=50, gamma=0.01 ................................................
[CV] ................................. C=50, gamma=0.01, total=   0.0s
Best parameters found by grid search:
{'C': 10, 'gamma': 0.0005}[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:    1.9s finished



# Predict the PCA-projected test split with the best estimator from the grid
# search and time the prediction plus confusion-matrix computation.
t = time()
y_pred = clf.best_estimator_.predict(X_test_pca)
cm = confusion_matrix(y_test, y_pred, labels=range(n_targets))
print("Done in {0:.2f}.\n".format(time()-t))
print("confusion matrix:")
# Restored: the matrix was announced but never printed.
print(cm)
Done in 0.01.confusion matrix:
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0][0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
from sklearn.metrics import classification_report

# Argument order matters: true labels first, predictions second.
# labels=targets pins the report rows to every person id; without it the
# 40 names are matched against only the ids present in the data (the
# "labels size ... does not match size of target_names" warning), which
# attaches names to the wrong rows.
print(classification_report(y_test, y_pred, labels=targets,
                            target_names=target_names))
             precision    recall  f1-score   supportp0       1.00      1.00      1.00         1p1       1.00      1.00      1.00         3p2       1.00      0.50      0.67         2p3       1.00      1.00      1.00         1p4       1.00      1.00      1.00         1p5       1.00      1.00      1.00         1p6       1.00      0.75      0.86         4p7       1.00      1.00      1.00         2p8       1.00      1.00      1.00         4p9       1.00      1.00      1.00         2p10       1.00      1.00      1.00         1p11       1.00      1.00      1.00         4p12       1.00      1.00      1.00         4p13       1.00      1.00      1.00         1p14       1.00      1.00      1.00         1p15       0.75      1.00      0.86         3p16       1.00      1.00      1.00         2p17       1.00      1.00      1.00         2p18       1.00      1.00      1.00         2p19       1.00      1.00      1.00         1p20       1.00      1.00      1.00         2p21       1.00      1.00      1.00         3p22       1.00      1.00      1.00         2p23       1.00      1.00      1.00         3p24       0.75      1.00      0.86         3p25       1.00      1.00      1.00         2p26       1.00      1.00      1.00         2p27       1.00      1.00      1.00         2p28       1.00      1.00      1.00         2p29       1.00      1.00      1.00         3p30       1.00      1.00      1.00         2p31       1.00      1.00      1.00         2p32       1.00      1.00      1.00         2p33       1.00      1.00      1.00         3p34       1.00      1.00      1.00         1p35       1.00      1.00      1.00         2p36       1.00      1.00      1.00         2avg / total       0.98      0.97      0.97        80/Users/hadoop/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1428: UserWarning: labels size, 37, does not match size of target_names, 40.format(len(labels), len(target_names))






