







Study with the several resources on Docsity
Earn points by helping other students or get them with a premium plan
Prepare for your exams
Study with the several resources on Docsity
Earn points to download
Earn points by helping other students or get them with a premium plan
Community
Ask the community for help and clear up your study doubts
Discover the best universities in your country according to Docsity users
Free resources
Download our free guides on studying techniques, anxiety management strategies, and thesis advice from Docsity tutors
Introduction to Statistical Learning (James/Witten/Hastie/Tibshirani)
Typology: Exercises
1 / 13
This page cannot be seen from the preview
Don't miss anything!
#----- save_plots = F set.seed(0) vt100ClearScreen <- function(...) cat("\033[2J") vt100ClearScreen() DM = matrix( data=c(0.0,0.3,0.4,0.7,0.3,0.0,0.5,0.8,0.4,0.5,0.0,0.45,0.7,0.8,0.45, .0), nrow=4, ncol=4, byrow=TRUE ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_2_complete_linkage.eps", onefile=FALSE, horizontal=FALSE) } plot( hclust( as.dist( DM ), method="complete" ), main="complete linkage" ) if( save_plots ){ dev.off() } if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_2_single_linkage.eps", onefile=FALSE, horizontal=FALSE) } plot( hclust( as.dist( DM ), method="single" ), main="single linkage" ) if( save_plots ){ dev.off() }
#----- save_plots = F set.seed(0) vt100ClearScreen <- function(...) cat("\033[2J") vt100ClearScreen()
DF = data.frame( x1=c(1,1,0,5,6,4), x2=c(4,3,4,1,2,0) ) n = dim(DF)[1] K = 2
labels = sample( 1:K, n, replace=TRUE ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_3_initial_plot.eps", onefile=FALSE, horizontal=FALSE) } plot( DF$x1, DF$x2, cex=2.0, pch=19, col=(labels+1), xlab="gene index", ylab="unpaired t-value" ) grid() if( save_plots ){ dev.off() } while( TRUE ){
cents = matrix( nrow=K, ncol=2 ) for( l in 1:K ){ samps = labels==l cents[l,] = apply( DF[samps,], 2, mean ) }
new_labels = rep( NA, n ) for( si in 1:n ){ smallest_norm = +Inf for( l in 1:K ){ nm = norm( as.matrix( DF[si,] - cents[l,] ), type="2" )
#----- save_plots = FALSE set.seed(0) vt100ClearScreen <- function(...) cat("\033[2J") vt100ClearScreen()
n = 1000 # the number of genes m = 100 # the number of tissue samples ##n = 20 ##m = 5 mu_A = 0.1 ## the machines are mostly the same but a bit different sigma_A = 1. mu_B = -0. sigma_B = 4. mu_C = 0.0 # the control and the treatment means are similar mu_T = 0. X = matrix(0, nrow=n, ncol=m) prob_of_machine_A = seq(1, 1.e-6, length.out=m) machine = c() treatment = c() for( jj in 1:m ){
machine_used = sample(c('A', 'B'), size=1, prob=c(prob_of_machine_A[jj], 1-prob_of_machine_A[jj])) machine = c(machine, machine_used)
type = sample(c('C', 'T'), size=1, prob=c(0.5, 0.5)) treatment = c(treatment, type) if( machine_used=='A' ){ if( type=='C' ){ x = rnorm(n, mean=(mu_A+mu_C), sd=sigma_A) }else{ x = rnorm(n, mean=(mu_A+mu_T), sd=sigma_A)
}else{ if( type=='C' ){ x = rnorm(n, mean=(mu_B+mu_C), sd=sigma_B) }else{ x = rnorm(n, mean=(mu_B+mu_T), sd=sigma_B) } } X[, jj] = x } pr.out = prcomp(X, scale=TRUE) ##print(summary(pr.out))
print(pr.out$sdev[1:10]^2/sum(pr.out$sdev^2))
X_transformed = X - pr.out$x[,1] %*% t(pr.out$rotation[, 1])
print(t.test(X_transformed[, treatment=='C'], X_transformed[, treatment=='T']))
machine_A = machine=='A' X_A = X[, machine_A] machine_B = machine=='B' X_B = X[, machine_B] print(sprintf('mu_A= %f; mean(X_A)= %f; mu_B= %f; mean(X_B)= %f', mu_A, mean(X_A), mu_B, mean(X_B))) X_2 = cbind(X_A - mean(X_A), X_B - mean(X_B)) pr.out = prcomp(X_2, scale=TRUE) ##print(summary(pr.out))
print(pr.out$sdev[1:10]^2/sum(pr.out$sdev^2))
print(t.test(X_2[, treatment=='C'], X_2[, treatment=='T']))
#----- save_plots = FALSE set.seed(0) vt100ClearScreen <- function(...) cat("\033[2J") vt100ClearScreen() pr.out = prcomp(USArrests, scale=TRUE)
pr.var = pr.out$sdev^ pve_1 = pr.var / sum(pr.var)
USArrests_scaled = scale( USArrests ) denom = sum( apply( USArrests_scaled^2, 2, sum ) ) Phi = pr.out$rotation USArrests_projected = USArrests_scaled %*% Phi # this is the same as pr.out$x numer = apply( pr.out$x^2, 2, sum ) pve_2 = numer / denom print(pve_1) print(pve_2) print(pve_1 - pve_2)
#----- save_plots = F set.seed(0) vt100ClearScreen <- function(...) cat("\033[2J") vt100ClearScreen()
hclust.complete = hclust( dist(USArrests), method="complete" ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_9_complete_linkage.eps", onefile=FALSE, horizontal=FALSE) } plot( hclust.complete, xlab="", sub="", cex=0.9 ) if( save_plots ){ dev.off() } #cutree( hclust.complete, h=150 ) # height we cut at ct = cutree( hclust.complete, k=3 ) # number of clusters to cut into
for( k in 1:3 ){ print(k) print( rownames( USArrests )[ ct == k ] ) }
hclust.complete.scale = hclust( dist(scale(USArrests,center=FALSE)), method="complete" ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_9_complete_linkage_w_scaling .eps", onefile=FALSE, horizontal=FALSE) } plot( hclust.complete.scale, xlab="", sub="", cex=0.9 ) if( save_plots ){ dev.off() } ct = cutree( hclust.complete.scale, k=3 ) # number of clusters to cut into
#----- save_plots = FALSE set.seed(0) vt100ClearScreen <- function(...) cat("\033[2J") vt100ClearScreen()
K = 3 # the number of classes n = 20 # the number of samples per class p = 50 # the number of variables
X_1 = matrix( rnorm(n*p), nrow=n, ncol=p ) for( row in 1:n ){ X_1[row,] = X_1[row,] + rep( 1, p ) }
X_2 = matrix( rnorm(n*p), nrow=n, ncol=p ) for( row in 1:n ){ X_2[row,] = X_2[row,] + rep( -1, p ) }
X_3 = matrix( rnorm(n*p), nrow=n, ncol=p ) for( row in 1:n ){ X_3[row,] = X_3[row,] + c( rep( +1, p/2 ), rep( -1, p/2 ) ) } X = rbind( X_1, X_2, X_3 ) labels = c( rep(1,n), rep(2,n), rep(3,n) ) # the "true" labels of the points pr.out = prcomp( X, scale=TRUE ) if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_10_pr_plot.eps", onefile=FALSE, horizontal=FALSE) } plot( pr.out$x[,1], pr.out$x[,2], col=labels, pch=19 ) grid() if( save_plots ){ dev.off() }
kmean.out = kmeans( X, centers=3, nstart=50 ) table( kmean.out$cluster, labels )
kmean.out = kmeans( X, centers=2, nstart=50 )
kmean.out = kmeans( X, centers=4, nstart=50 )
kmean.out = kmeans( pr.out$x[,c(1,2)], centers=3, nstart=50 )
Xs = scale( X ) kmean.out = kmeans( Xs, centers=3, nstart=50 )
n2 = apply( DF[ predicted==2, ], 2, length ) m1 = apply( DF[ predicted==1, ], 2, mean ) # the means across the 1000 genes in each cluster m2 = apply( DF[ predicted==2, ], 2, mean ) v1 = apply( DF[ predicted==1, ], 2, var ) # the variances across the 1000 genes in each cluster v2 = apply( DF[ predicted==2, ], 2, var ) pooled_variance = sqrt( v1 / n1 + v2 / n2 ) t_value = ( m1 - m2 ) / pooled_variance if( save_plots ){ postscript("../../WriteUp/Graphics/Chapter10/prob_11_t_value.eps", onefile=FALSE, horizontal=FALSE) } plot( t_value, xlab="gene index", ylab="unpaired t-value" ) if( save_plots ){ dev.off() }