# NOTE(review): a second definition of ``pca`` appears later in this file and
# replaces this one at import time; consider removing one of the two copies.
def pca(dataMat, topNfeat=9999999):
    """Project data onto its leading principal components.

    Args:
        dataMat: 2-D data (``np.matrix`` or array-like of numbers) with one
            sample per row and one feature per column.
        topNfeat: Maximum number of principal components to keep. The default
            is large enough to keep every component.

    Returns:
        A ``(low_datamat, reconmat)`` tuple of ``np.matrix`` objects:
        ``low_datamat`` is the mean-centred data expressed in the kept
        components (one column per component, largest variance first) and
        ``reconmat`` is the data rebuilt from those components in the
        original feature space.
    """
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # centre every feature on zero

    # np.cov returns a 0-d array for a single feature; eigh needs 2-D input.
    covMat = np.atleast_2d(np.cov(meanRemoved, rowvar=0))

    # Compute the eigenvalues and eigenvectors of the covariance matrix.
    # A covariance matrix is symmetric, so eigh is used: unlike eig it never
    # yields complex results. The sign of each eigenvector is arbitrary.
    eigvals, eigvects = np.linalg.eigh(covMat)

    # argsort orders smallest to largest; walk it backwards to keep the
    # topNfeat largest eigenvalues, largest first.
    eig_valind = np.argsort(eigvals)[:-(topNfeat + 1):-1]

    # np.asmatrix (np.mat was removed in NumPy 2.0) keeps ``*`` below as a
    # matrix product and keeps the returned values np.matrix instances.
    red_eigvects = np.asmatrix(eigvects[:, eig_valind])

    low_datamat = meanRemoved * red_eigvects  # data in the reduced space
    reconmat = (low_datamat * red_eigvects.T) + meanVals  # back to feature space
    return low_datamat, reconmat
import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:  # plotting is optional; loading and PCA work without it
    plt = None


def loadDataSet(fileName, delim='\t'):
    """Read a delimited text file of numbers into a matrix.

    Args:
        fileName: Path of the text file to read.
        delim: Field separator used on each line (a tab by default).

    Returns:
        An ``np.matrix`` with one row per non-blank line of the file and one
        column per field, every field converted with ``float``.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager closes the file even if a conversion fails.
    with open(fileName) as fr:
        datArr = [
            [float(field) for field in line.strip().split(delim)]
            for line in fr
            if line.strip()  # skip blank lines, e.g. a trailing newline
        ]
    return np.asmatrix(datArr)


def pca(dataMat, topNfeat=9999999):
    """Project data onto its leading principal components.

    Args:
        dataMat: 2-D data (``np.matrix`` or array-like of numbers) with one
            sample per row and one feature per column.
        topNfeat: Maximum number of principal components to keep. The default
            is large enough to keep every component.

    Returns:
        A ``(low_datamat, reconmat)`` tuple of ``np.matrix`` objects:
        ``low_datamat`` is the mean-centred data expressed in the kept
        components (one column per component, largest variance first) and
        ``reconmat`` is the data rebuilt from those components in the
        original feature space.
    """
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # centre every feature on zero

    # np.cov returns a 0-d array for a single feature; eigh needs 2-D input.
    covMat = np.atleast_2d(np.cov(meanRemoved, rowvar=0))

    # Compute the eigenvalues and eigenvectors of the covariance matrix.
    # A covariance matrix is symmetric, so eigh is used: unlike eig it never
    # yields complex results. The sign of each eigenvector is arbitrary.
    eigvals, eigvects = np.linalg.eigh(covMat)

    # argsort orders smallest to largest; walk it backwards to keep the
    # topNfeat largest eigenvalues, largest first.
    eig_valind = np.argsort(eigvals)[:-(topNfeat + 1):-1]

    # np.asmatrix (np.mat was removed in NumPy 2.0) keeps ``*`` below as a
    # matrix product and keeps the returned values np.matrix instances.
    red_eigvects = np.asmatrix(eigvects[:, eig_valind])

    low_datamat = meanRemoved * red_eigvects  # data in the reduced space
    reconmat = (low_datamat * red_eigvects.T) + meanVals  # back to feature space
    return low_datamat, reconmat


def plotBestFit(dataSet1, dataSet2):
    """Scatter-plot the first two columns of two data sets on one figure.

    ``dataSet1`` is drawn as red squares and ``dataSet2`` as green circles;
    the figure is then displayed with ``plt.show()``.

    Args:
        dataSet1: 2-D data with at least two columns.
        dataSet2: 2-D data with at least two columns.

    Raises:
        ImportError: If matplotlib could not be imported.
    """
    if plt is None:
        raise ImportError("matplotlib is required for plotBestFit")

    dataArr1 = np.array(dataSet1)
    dataArr2 = np.array(dataSet2)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Each set is plotted from its own columns, so the two sets may have
    # different numbers of rows.
    ax.scatter(dataArr1[:, 0], dataArr1[:, 1], s=30, c='red', marker='s')
    ax.scatter(dataArr2[:, 0], dataArr2[:, 1], s=30, c='green')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


if __name__ == '__main__':
    mata = loadDataSet('score')
    a, b = pca(mata, 4)
    plotBestFit(a, b)