1. R gibbspanel function
library(MCMCpack)
library(mnormt)
### reading simulated data set 1###
data1<-read.table("data1.txt")
cluster<-rep(1,117)
d11<-cbind(cluster,data1[,2:12])
### Function gibbspanel:Bayesian analysis of AR(p) panel data #(Eq. 1 at Bioinformatics paper)
gibbspanel<-function(data1,iter,nlag,burnin)
{
m<-nrow(data1)
p<-2 #order of AR(p)
m=nrow(data1)
n=ncol(data1)-2
### matrix to keep the generated MCMC values###
fi_est<-matrix(0,nrow=iter,(nrow(data1)*p)+nrow(data1))
sigma_e<-matrix(0, nrow=iter,1)
### MCMC initial values ###
fi_est[1,]<- t(t(fi_est[1,]))
sigma_e[1]<-0.04
data_cluster<-matrix(0,nrow=nrow(data1),p+2)
### Hyperparameters values (mu, P, alpha, beta) ###
P=diag(0.05,(m*p)+m,(m*p)+m)
mu=matrix(0.9,(m*p)+m,1)
alpha1=1000
beta1=50
### Gene expression (fold-change) values ###
y<-data1[,3:ncol(data1)]
### Function gibbspanel codes###
b<-matrix(t(y[,p:(n-1)]),m*(n-p),1)
c<-matrix(t(y[,1:(n-p)]),m*(n-p),1)
X_aux<-matrix(0,m*(n-p),m*(p+1))
M<-matrix(1,n-p,3)
X_auxi<-kronecker(diag(1, m), M)
Xi<-matrix(cbind(1,b,c),m*(n-p),m*(p+1))
X<-Xi*X_auxi
Y1<-matrix(t(y[,(p+1):n]),m*(n-p),1)
sigma_M<-t(X)%*%X+solve(P)
fiB<-solve(sigma_M)%*%(t(X)%*%Y1+P%*%mu)
D<-beta1 +0.5*((t(Y1)%*%Y1+t(mu)%*%solve(P)%*%mu)
-t((t(X)%*%Y1+solve(P)%*%mu))%*%
solve(sigma_M)%*%(t(X)%*%Y1+solve(P)%*%mu))
shape1<-0.5*((m*(n+1))+(2*alpha1))
for(iter in 2:iter)
{
fi_est[iter,]<-mvrnorm(n=1, fiB, sigma_e[iter-1]*sigma_M)
sigma_e[iter,]<-rinvgamma(1,shape1,D+0.5*(t(t(t(fi_est[iter,]))-fiB)%*%solve(sigma_M)%*%
(t(t(fi_est[iter,]))-fiB)))
}
a<<-fi_est
b<<-sigma_e
h<-as.data.frame(fi_est)
media1<-colMeans(h[burnin:iter,])
media<-matrix(media1,1,(m*p)+m)
precisao<-mean(sigma_e[burnin:iter,])
sigma_e1_cluster<-rep(precisao,nrow(data1))
data_cluster[,4]<-t(t(sigma_e1_cluster))
data_cluster[,1]<-t(media[1,c(seq(1,3*m,3))])
data_cluster[,2]<-t(media[1,c(seq(2,(3*m)-1,3))])
data_cluster[,3]<-t(media[1,c(seq(3,(3*m),3))])
data_cluster1<-cbind(data1[,2],data1[,1],data_cluster)
return(list(data_cluster1=data_cluster1))
}
### Number of cluster via Mojena (1977) method ###
### Function clustering ###
clustering<-function(clust_data,data1)
{
mdist<-dist(clust_data[,3:6], method="euclidean")
agrup<-hclust(mdist^2,method ="ward")
# plot(agrup)
mojena=mean(agrup$height)+1.25*sd(agrup$height)
k=1+length(agrup$height[agrup$height>mojena])
# rect.hclust(agrup, k = k)
clusters<-cutree(agrup, k=k)
clusters1<<-as.data.frame(clusters)
data1<-cbind(clusters,data1[,-1] )
return(list(data1=data1,clusters=clusters))
}
### proposed clustering ###
cluster_teste<-as.matrix(0,101,nrow(d11))
i<-2
repeat
x<-gibbspanel(d11,10,2,5)
agr<-clustering(x$data_cluster1,d11)
cluster_teste[i,]<-as.matrix(t(agr$clusters))
x1<-by(agr$data1,agr$data1[,1], function(x) gibbspanel(x,10,2,5))
x2<-do.call("rbind",x1)
x2<-x2[order(x2[,1]),]
clusters1<-t(t(agr$cluster))
d11<-cbind(clusters1,d11[,-1])
if(sum(cluster_teste[i,]-cluster_teste[i-1,])==0)
break
i<-i+1
}
x2[,2]
### percentage of correct clustering ###
simu<-c(rep(1,23),rep(2,32),rep(3,15),rep(4,24),rep(5,23))
comp<-cbind(x2[,2],simu)
cont<-0
for ( i in 1:117)
{
if(comp[i,1]==comp[i,2])
{
cont<-cont+1
}
}
p_correct <-(cont/117)*100
p_correct
|
2. R simulation program
real_cluster1=matrix(NA,23,10)
for(i in 1:23)
{
real_cluster1[i,]=0.9046978+arima.sim(n = 10, list(ar = c(0.9046, -0.6988)),sd = 0.2041)
}
real_cluster2=matrix(NA,32,10)
for(i in 1:32)
{
real_cluster2[i,]=0.06502565+arima.sim(n = 10, list(ar = c(0.5153, -0.7188)),sd = 0.13816)
}
real_cluster3=matrix(NA,15,10)
for(i in 1:15)
{
real_cluster3[i,]=0.08059805+arima.sim(n = 10, list(ar = c(1.14375, -0.7949)), sd = 0.2414)
}
real_cluster4=matrix(NA,24,10)
for(i in 1:24)
{
real_cluster4[i,]=0.0595330+arima.sim(n = 10, list(ar = c(0.82451, -0.72953)), sd = 0.170921)
}
real_cluster5=matrix(NA,23,10)
for(i in 1:23)
{
real_cluster5[i,]=-0.10160+arima.sim(n = 10, list(ar = c(-0.08006, -0.6992)), sd = 0.12942)
}
data=data.frame(rbind(matrix(rep(1,23)),matrix(rep(2,32)),matrix(rep(3,15)),matrix(rep(4,24)),
matrix(rep(5,23))),seq(1,117),rbind(real_cluster1,real_cluster2,real_cluster3,real_cluster4,
real_cluster5))
colnames(data)=c("real_cluster","gene",noquote(paste(rep("time_",10), seq(0,135,15))))
write.table(data, "data1_new.txt", quote=FALSE, row.names=FALSE)
###K-means ###
kmeans1=data.frame(kmeans(data[,-(1:2)], 5, iter.max = 10000, nstart = 5)$cluster)
kmeans2=data.frame(cbind(matrix(rownames(kmeans1)),kmeans1))
colnames(kmeans2)=c("gene", "est_cluster")
kmeans3=merge(data[,1:2],kmeans2, by=intersect("gene","gene"))
dif_kmeans=(kmeans3$real_cluster - kmeans3$est_cluster)
correct_clust_kmeans=100*length(dif_kmeans[dif_kmeans==0])/nrow(data)
correct_clust_kmeans
####plot time series######
data1=matrix(t(data[,-(1:2)]),10*150,1)
gene1=sort(rep(data$gene,10))
cluster1=sort(rep(data$real_cluster,10))
x1=matrix(rep(seq(0,135,15),150))
data2=data.frame(cbind(cluster1, gene1, data1, x1))
colnames(data2)=c("real_cluster","gene","fc","time")
library(lattice)
par(mfrow=c(3,2))
xyplot(fc ~ time, data = data2[data2$real_cluster==1,], groups = gene, type="a",
auto.key = list(space = "right", points = FALSE, lines = TRUE))
xyplot(fc ~ time, data = data2[data2$real_cluster==2,], groups = gene, type="a",
auto.key = list(space = "right", points = FALSE, lines = TRUE))
xyplot(fc ~ time, data = data2[data2$real_cluster==3,], groups = gene, type="a",
auto.key = list(space = "right", points = FALSE, lines = TRUE))
xyplot(fc ~ time, data = data2[data2$real_cluster==4,], groups = gene, type="a",
auto.key = list(space = "right", points = FALSE, lines = TRUE))
xyplot(fc ~ time, data = data2[data2$real_cluster==5,], groups = gene, type="a",
auto.key = list(space = "right", points = FALSE, lines = TRUE))
###plot average observed time series#####
fc1=matrix(colMeans(data[data$real_cluster==1,-(1:2)]))
fc2=matrix(colMeans(data[data$real_cluster==2,-(1:2)]))
fc3=matrix(colMeans(data[data$real_cluster==3,-(1:2)]))
fc4=matrix(colMeans(data[data$real_cluster==4,-(1:2)]))
fc5=matrix(colMeans(data[data$real_cluster==5,-(1:2)]))
data_avg=data.frame(cbind(rbind(fc1,fc2,fc3,fc4,fc5), rep(seq(0,135,15),5), sort(rep(seq(1,5),10))))
colnames(data_avg)=c("fc_avg","time","real_cluster")
xyplot(fc_avg ~ time, data = data_avg, groups = real_cluster, type="a",
auto.key = list(space = "right", points = FALSE, lines = TRUE))
|