PCA使用ggbiplot进行缩放 [英] PCA Scaling with ggbiplot

查看:1567
本文介绍了PCA使用ggbiplot进行缩放的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我试图用 prcomp ggbiplot 来绘制一个主成分分析。我正在获取单位圆外的数据值,并且无法在调用 prcomp 之前重新调整数据的大小,以便可以将数据限制为(酒)
需要(ggbiplot)
wine.pca = prcomp(单位)。

  wine [,1:3],scale。= TRUE)
ggbiplot(wine.pca,obs.scale = 1,
var.scale = 1,groups = wine.class,ellipse = TRUE,circle = TRUE)

在调用 prcomp :

  wine2 = wine [,1:3] 
mean = apply (wine2,2,mean)
sd = apply(wine2,2,mean)
for(i in 1:ncol(wine2)){
wine2 [,i] =(wine2 [ (wine2,scale。= TRUE)
ggbiplot(wine2.pca,obs.scale = 1)[b]

wine2.pca = 1,
var.scale = 1,groups = wine.class,ellipse = TRUE,circle = TRUE)

ggbiplot 安装包如下:

  r equire(devtools)
install_github('ggbiplot','vqv')

代码块:





根据@Brian Hanson的评论,我添加了一个反映我想要获得的输出的额外图像。



解决方案


$ b

  ggbiplot2 

我编辑了plot函数的代码,并能够获得我想要的功能。 = function(pcobj,choices = 1:2,scale = 1,pc.biplot = TRUE,
obs.scale = 1 - scale,var.scale = scale,
groups = NULL,ellipse = FALSE ,ellipse.prob = 0.68,
labels = NULL,labels.size = 3,alpha = 1,
var.axes = TRUE,
circle = FALSE,circle.prob = 0.69,
varname.size = 3,varna me.adjust = 1.5,
varname.abbrev = FALSE,...)
{
library(ggplot2)
library(plyr)
library(scales)
库(grid)

stopifnot(长度(选择)== 2)

#恢复SVD
if(inherits(pcobj,'prcomp' )){
nobs.factor< - sqrt(nrow(pcobj $ x)-1)
d< - pcobj $ sdev
u< - 扫描(pcobj $ x,2,1 /(d * nobs.factor),FUN ='*')
v< - pcobj $ rotation
} else if(inherits(pcobj,'princomp')){
nobs.factor < - sqrt(pcobj $ n.obs)
d< - pcobj $ sdev
u< - 扫描(pcobj $分数,2,1 /(d * nobs.factor),FUN ='* ')
v < - pcobj $ loadings
} else if(继承(pcobj,'PCA')){
nobs.factor< - sqrt(nrow(pcobj $ call $ X) )
d < - unlist(sqrt(pcobj $ eig)[1])$ ​​b $ bu < - sweep(pcobj $ ind $ coord,2,1 /(d * nobs.factor),FUN =' *')
v < - 扫描(pcobj $ var $ coord,2,sqrt(pcobj $ eig [1:ncol(pcobj $ var $ coord),1] ),FUN =/)
} else {
stop('期望类prcomp,princomp或PCA'的对象)
}

#得分
df.u< - as.data.frame(sweep(u [,choices],2,d [choices] ^ obs.scale,FUN ='*'))


v< - sweep(v,2,d ^ var.scale,FUN ='*')
df.v < - as.data.frame(v [,choices])

名称(df.u)< - c('xvar','yvar')
名称(df.v)< - 名称(df.u)

if(pc.biplot){
df.u< - df.u * nobs.factor
}

#缩放相关圆的半径,使它对应于标准化PC分数的
#a数据椭圆
r < - 1

#比例方向
v.scale < - rowSums(v ^ 2)
df.v < - df.v / sqrt(max(v.scale))

##比例分数
r.scale = sqrt(max(df.u [,1] ^ 2 + df.u [,2] ^ 2))
df.u = .99 * df.u / r.scale

#更改轴
if(obs.scale == 0){
u.axis.labs< - paste('s ('PC',choices,sep ='')
}

#将解释的方差比例附加到轴标签上
u.axis.labs < - paste(u.axis.labs,
sprintf('(%0.1f %%解释变量)',
100 * pcobj $ sdev [options] ^ 2 / sum(pcobj $ sdev ^ 2)))

#得分标签
if(!is .nu​​ll(标签)){
df.u $标签< - 标签
}

#分组变量
if(!is.null(groups)) {
df.u $ groups< - groups
}

#变量名称
if(varname.abbrev){
df.v $ varname < - abbreviate(rownames(v))
} else {
df.v $ varname< - rownames(v)
}

#文本的变量(df.v,(180 / pi)* atan(yvar / xvar))
df.v $ hjust = with(df.v,( 1 - varname.adjust * sign(xvar))/ 2)

#底图
g < - ggplot(data = df.u,aes(x = xvar,y = yvar))+
xlab(u.axis.labs [1])+ ylab(u.axis.labs [2] )+ coord_equal()

if(var.axes){
#绘制圆圈
如果(圆圈)
{
theta< - c seq(-pi,pi,length = 50),seq(pi,-pi,length = 50))
circle < - data.frame(xvar = r * cos(theta),yvar = r * sin (theta))
g < - g + geom_path(data = circle,color = muted('white'),
size = 1/2,alpha = 1/3)
}

#绘制方向
g < - g +
geom_segment(data = df.v,
aes(x = 0,y = 0,xend = xvar,yend = yvar),
arrow =箭头(长度=单位(1/2,'picas')),
color =静音('red'))
}

#如果(!is.null(df.u $ labels)){
if(!is.null(df.u $ groups)){
g,绘制标签或点
< - g + geom_text(aes(label = labels,color = groups),
size = labels.size)
} else {
g< -g + geom_text(aes(label = labels),size = labels.size)
}
} else {
if(!is.null(df.u $ groups)){
g < - g + geom_point(aes(color = groups),alpha = alpha)
} else {
g <-g + geom_point(alpha = alpha)
}
}

#如果存在组
,覆盖浓度椭圆if(!is.null( df.u $ groups)&&椭圆){
theta <-c(seq(-pi,pi,length = 50),seq(pi,-pi,length = 50))
circle <-cbind(cos(theta ),sin(theta))

ell < - ddply(df.u,'groups',function(x){
if(nrow(x)<2){
return(NULL)
} else if(nrow(x)== 2){
sigma< - var(cbind(x $ xvar,x $ yvar))
} (x(x $ xvar),var(x $ yvar)))
}
mu< -c(平均值(x $ xvar) ,平均(x $ yvar))
ed< - sqrt(qchisq(ellipse.prob,df = 2))
data.frame(sweep(circle%*%chol sigma)* ed, 2,mu,FUN ='+'),
groups = x $ groups [1])$ ​​b $ b})
names(ell)[1:2]< - c('xvar ','yvar')
g < - g + geom_path(data = ell,aes(color = groups,group = groups))
}

#标记变量轴
if(var.axes){
g< - g +
geom_text(data = df.v,
aes(label = varname,x = xvar ,y = yvar,
angle = angle,hjust = hjust),
color ='darkred',size = varname.size)
}
#更改图例的名称对于组
#if(!is.null(groups)){
#g < - g + scale_color_brewer(name = deparse(substitute(groups)),
#palette ='Dark2 ')
#}

#TODO:添加第二组轴

返回(g)
}


I'm trying to plot a principal component analysis using prcomp and ggbiplot. I'm getting data values outside of the unit circle, and haven't been able to rescale the data prior to calling prcomp in such a way that I can constrain the data to the unit circle.

data(wine)
require(ggbiplot)
wine.pca=prcomp(wine[,1:3],scale.=TRUE)
ggbiplot(wine.pca,obs.scale = 1, 
         var.scale=1,groups=wine.class,ellipse=TRUE,circle=TRUE)

I tried scaling by subtracting mean and dividing by standard deviation before calling prcomp:

wine2=wine[,1:3]
mean=apply(wine2,2,mean)
sd=apply(wine2,2,mean)
for(i in 1:ncol(wine2)){
  wine2[,i]=(wine2[,i]-mean[i])/sd[i]
}
wine2.pca=prcomp(wine2,scale.=TRUE)
ggbiplot(wine2.pca,obs.scale=1, 
         var.scale=1,groups=wine.class,ellipse=TRUE,circle=TRUE)

ggbiplot package installed as follows:

require(devtools)
install_github('ggbiplot','vqv')

Output of either code chunk:

Per @Brian Hanson's comment below, I'm adding an additional image reflecting the output I'm trying to get.

解决方案

I edited the code for the plot function and was able to get the functionality I wanted.

ggbiplot2=function(pcobj, choices = 1:2, scale = 1, pc.biplot = TRUE, 
         obs.scale = 1 - scale, var.scale = scale, 
         groups = NULL, ellipse = FALSE, ellipse.prob = 0.68, 
         labels = NULL, labels.size = 3, alpha = 1, 
         var.axes = TRUE, 
         circle = FALSE, circle.prob = 0.69, 
         varname.size = 3, varname.adjust = 1.5, 
         varname.abbrev = FALSE, ...)
{
  library(ggplot2)
  library(plyr)
  library(scales)
  library(grid)

  stopifnot(length(choices) == 2)

  # Recover the SVD
  if(inherits(pcobj, 'prcomp')){
    nobs.factor <- sqrt(nrow(pcobj$x) - 1)
    d <- pcobj$sdev
    u <- sweep(pcobj$x, 2, 1 / (d * nobs.factor), FUN = '*')
    v <- pcobj$rotation
  } else if(inherits(pcobj, 'princomp')) {
    nobs.factor <- sqrt(pcobj$n.obs)
    d <- pcobj$sdev
    u <- sweep(pcobj$scores, 2, 1 / (d * nobs.factor), FUN = '*')
    v <- pcobj$loadings
  } else if(inherits(pcobj, 'PCA')) {
    nobs.factor <- sqrt(nrow(pcobj$call$X))
    d <- unlist(sqrt(pcobj$eig)[1])
    u <- sweep(pcobj$ind$coord, 2, 1 / (d * nobs.factor), FUN = '*')
    v <- sweep(pcobj$var$coord,2,sqrt(pcobj$eig[1:ncol(pcobj$var$coord),1]),FUN="/")
  } else {
    stop('Expected a object of class prcomp, princomp or PCA')
  }

  # Scores
  df.u <- as.data.frame(sweep(u[,choices], 2, d[choices]^obs.scale, FUN='*'))

  # Directions
  v <- sweep(v, 2, d^var.scale, FUN='*')
  df.v <- as.data.frame(v[, choices])

  names(df.u) <- c('xvar', 'yvar')
  names(df.v) <- names(df.u)

  if(pc.biplot) {
    df.u <- df.u * nobs.factor
  }

  # Scale the radius of the correlation circle so that it corresponds to 
  # a data ellipse for the standardized PC scores
  r <- 1

  # Scale directions
  v.scale <- rowSums(v^2)
  df.v <- df.v / sqrt(max(v.scale))

  ## Scale Scores
  r.scale=sqrt(max(df.u[,1]^2+df.u[,2]^2))
  df.u=.99*df.u/r.scale

  # Change the labels for the axes
  if(obs.scale == 0) {
    u.axis.labs <- paste('standardized PC', choices, sep='')
  } else {
    u.axis.labs <- paste('PC', choices, sep='')
  }

  # Append the proportion of explained variance to the axis labels
  u.axis.labs <- paste(u.axis.labs, 
                       sprintf('(%0.1f%% explained var.)', 
                               100 * pcobj$sdev[choices]^2/sum(pcobj$sdev^2)))

  # Score Labels
  if(!is.null(labels)) {
    df.u$labels <- labels
  }

  # Grouping variable
  if(!is.null(groups)) {
    df.u$groups <- groups
  }

  # Variable Names
  if(varname.abbrev) {
    df.v$varname <- abbreviate(rownames(v))
  } else {
    df.v$varname <- rownames(v)
  }

  # Variables for text label placement
  df.v$angle <- with(df.v, (180/pi) * atan(yvar / xvar))
  df.v$hjust = with(df.v, (1 - varname.adjust * sign(xvar)) / 2)

  # Base plot
  g <- ggplot(data = df.u, aes(x = xvar, y = yvar)) + 
    xlab(u.axis.labs[1]) + ylab(u.axis.labs[2]) + coord_equal()

  if(var.axes) {
    # Draw circle
    if(circle) 
    {
      theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
      circle <- data.frame(xvar = r * cos(theta), yvar = r * sin(theta))
      g <- g + geom_path(data = circle, color = muted('white'), 
                         size = 1/2, alpha = 1/3)
    }

    # Draw directions
    g <- g +
      geom_segment(data = df.v,
                   aes(x = 0, y = 0, xend = xvar, yend = yvar),
                   arrow = arrow(length = unit(1/2, 'picas')), 
                   color = muted('red'))
  }

  # Draw either labels or points
  if(!is.null(df.u$labels)) {
    if(!is.null(df.u$groups)) {
      g <- g + geom_text(aes(label = labels, color = groups), 
                         size = labels.size)
    } else {
      g <- g + geom_text(aes(label = labels), size = labels.size)      
    }
  } else {
    if(!is.null(df.u$groups)) {
      g <- g + geom_point(aes(color = groups), alpha = alpha)
    } else {
      g <- g + geom_point(alpha = alpha)      
    }
  }

  # Overlay a concentration ellipse if there are groups
  if(!is.null(df.u$groups) && ellipse) {
    theta <- c(seq(-pi, pi, length = 50), seq(pi, -pi, length = 50))
    circle <- cbind(cos(theta), sin(theta))

    ell <- ddply(df.u, 'groups', function(x) {
      if(nrow(x) < 2) {
        return(NULL)
      } else if(nrow(x) == 2) {
        sigma <- var(cbind(x$xvar, x$yvar))
      } else {
        sigma <- diag(c(var(x$xvar), var(x$yvar)))
      }
      mu <- c(mean(x$xvar), mean(x$yvar))
      ed <- sqrt(qchisq(ellipse.prob, df = 2))
      data.frame(sweep(circle %*% chol(sigma) * ed, 2, mu, FUN = '+'), 
                 groups = x$groups[1])
    })
    names(ell)[1:2] <- c('xvar', 'yvar')
    g <- g + geom_path(data = ell, aes(color = groups, group = groups))
  }

  # Label the variable axes
  if(var.axes) {
    g <- g + 
      geom_text(data = df.v, 
                aes(label = varname, x = xvar, y = yvar, 
                    angle = angle, hjust = hjust), 
                color = 'darkred', size = varname.size)
  }
  # Change the name of the legend for groups
  # if(!is.null(groups)) {
  #   g <- g + scale_color_brewer(name = deparse(substitute(groups)), 
  #                               palette = 'Dark2')
  # }

  # TODO: Add a second set of axes

  return(g)
}

这篇关于PCA使用ggbiplot进行缩放的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆