spvec<-iris$Species|>levels()k.cent3$cluster<-k.cent3$cluster-1k.cent3$cluster<-if_else(k.cent3$cluster==0, 3, k.cent3$cluster)cbind(iris, cluster =spvec[k.cent3$cluster])|>gather(tag, value, 5:6)|>ggplot(aes(x =Sepal.Length, y =Sepal.Width, color =value))+geom_point(size =3)+facet_wrap(~tag)
32.5 c-means
c-means法はfuzzy c-means法とも呼ばれる、クラスターへの所属を確率で求める手法です。c-means法もk-means法と同じく、データとクラスターの距離をもとにクラスタリングを行う手法の一つです。Rでは、c-means法の計算をe1071パッケージ (Meyer et al. 2023)のcmeans関数で行うことができます。cmeans関数もkmeans関数と同様に、データフレームと中心の数(centers)を引数に取ります。クラスターの中心の初期値は行列で与えることもできます。
c-means法
pacman::p_load(e1071)# クラスタリングの計算(中心を指定)iris.c<-cmeans(iris[,1:4], centers =iris[c(1, 51, 101), 1:4])# クラスターとirisの種の比較cbind(iris, cluster =spvec[iris.c$cluster])|>gather(tag, value, 5:6)|>ggplot(aes(x =Sepal.Length, y =Sepal.Width, color =value))+geom_point(size =3)+facet_wrap(~tag)## Warning: attributes are not identical across measure variables; they will be## dropped
iris_temp<-iris|>filter(Species!="setosa")|>_[ ,1:3]iris.pc2<-prcomp(iris_temp)pc1<-iris.pc2$x[ ,1]pc2<-iris.pc2$x[ ,2]pc3<-iris.pc2$x[ ,2]center1<-iris.pc2$center[1]center2<-iris.pc2$center[2]center3<-iris.pc2$center[3]iris_temp$pc1_Sepal.Length<-pc1*iris.pc2$rotation[1, 1]+center1iris_temp$pc1_Sepal.Width<-pc1*iris.pc2$rotation[2, 1]+center2iris_temp$pc1_Petal.Length<-pc1*iris.pc2$rotation[3, 1]+center3iris_temp$pc2_Sepal.Length<-pc2*iris.pc2$rotation[1, 2]+center1iris_temp$pc2_Sepal.Width<-pc2*iris.pc2$rotation[2, 2]+center2iris_temp$pc2_Petal.Length<-pc2*iris.pc2$rotation[3, 2]+center3iris_temp$pc3_Sepal.Length<-pc3*iris.pc2$rotation[1, 3]+center1iris_temp$pc3_Sepal.Width<-pc3*iris.pc2$rotation[2, 3]+center2iris_temp$pc3_Petal.Length<-pc3*iris.pc2$rotation[3, 3]+center3pacman::p_load(plotly)fig<-plot_ly(iris_temp, x =~Sepal.Length, y =~Sepal.Width, z =~Petal.Length, type ="scatter3d", mode ="markers", name ='data', marker =list(size =2))|>add_trace( x =~pc1_Sepal.Length, y =~pc1_Sepal.Width, z =~pc1_Petal.Length, type ="scatter3d", mode ="markers+lines", name ='PC1', marker =list(color ="red", size =0.1), line =list(color ="red", width =5))|>add_trace( x =~pc2_Sepal.Length, y =~pc2_Sepal.Width, z =~pc2_Petal.Length, type ="scatter3d", mode ="markers+lines", name ='PC2', marker =list(color ="red", size =0.1), line =list(color ="blue", width =5))|>add_trace( x =~pc3_Sepal.Length, y =~pc3_Sepal.Width, z =~pc3_Petal.Length, type ="scatter3d", mode ="markers+lines", name ='PC3', marker =list(color="red", size=0.1), line =list(color="green", width=5))fig
prcomp関数の返り値をsummary関数の引数に取ると、標準偏差、分散の配分(Proportion of Variance)、積算の分配の配分(Cumulative Proportion)が示されます。このうち、分散の配分はその主成分によってばらつきをどれだけ説明できているかを示すものです。下の例では、PC1で92.5%程度のばらつきを示すことができている、つまり、PC1でデータの差を十分説明できていることがわかります。
summaryの結果
iris.pc|>summary()## Importance of components:## PC1 PC2 PC3 PC4## Standard deviation 2.0563 0.49262 0.2797 0.15439## Proportion of Variance 0.9246 0.05307 0.0171 0.00521## Cumulative Proportion 0.9246 0.97769 0.9948 1.00000
iris.pc<-prcomp(iris[ ,1:4])irispcd<-iris.pc$x|>as.data.frame()irispcd$Species<-iris$Speciesggplot(irispcd, aes(x =PC1, y =PC2, color =Species))+geom_point(size =2)
m1vs<-m1v$scores|>as.data.frame()# 計算したスコアをデータフレームに変換m1vs$student<-1:nrow(m1vs)# 生徒のIDを追加ggplot(m1vs, aes(x =Factor1, y =Factor2, color =student, label =student))+geom_text(size =3)+theme(legend.position ="none")+labs(title ="バリマックス回転", x ="理系度", y ="文系度")
m1ps<-m1p$scores|>as.data.frame()m1ps$student<-1:nrow(m1ps)ggplot(m1ps, aes(x =Factor1, y =Factor2, color =student, label =student))+geom_text(size =3)+theme(legend.position ="none")+labs(title ="プロマックス回転", x ="理系度", y ="文系度")
JPpos<-as.data.frame(JPpos)colnames(JPpos)<-c("x", "y")JPpos$prefecture<-prefectureggplot(JPpos, aes(x =-x, y =-y, label =prefecture, color =prefecture))+geom_text(size =3)+theme(legend.position ="none")
cmdscale関数を用いても多次元尺度法の計算を行うことができます。
cmdscale関数で多次元尺度法
JPpos2<-JPD%>%cmdscale%>%as.data.framecolnames(JPpos2)<-c("x", "y")JPpos2$prefecture<-prefectureggplot(JPpos2, aes(x =x, y =y, label =prefecture, color =prefecture))+geom_text(size =3)+theme(legend.position ="none")
Consortium, Japanese Archipelago Human Population Genetics. 2012. “The History of Human Populations in the Japanese Archipelago Inferred from Genome-Wide SNP Data with a Special Reference to the Ainu and the Ryukyuan Populations.”Journal of Human Genetics 57 (12): 787–95. https://cir.nii.ac.jp/crid/1523106605673180672.
Karatzoglou, Alexandros, Alex Smola, Kurt Hornik, and Achim Zeileis. 2004. “Kernlab – an S4 Package for Kernel Methods in R.”Journal of Statistical Software 11 (9): 1–20. https://doi.org/10.18637/jss.v011.i09.
Meyer, David, Evgenia Dimitriadou, Kurt Hornik, Andreas Weingessel, and Friedrich Leisch. 2023. E1071: Misc Functions of the Department of Statistics, Probability Theory Group (Formerly: E1071), TU Wien. https://CRAN.R-project.org/package=e1071.