In this lesson we will learn about Data Science, what it is, how it works and what can go wrong. We will be doing an analysis on Covid19 data in Switzerland.
Note that the code below can be used but the quote signs ” may cause trouble when copy-and-pasting (instead use the Analysis Code files above)
library(covid19swiss)
dates<-unique(covid19swiss$date)[-1]
prepare case data
sel<-covid19swiss$data_type==”cases_total”
totcases<-data.frame(t(tapply(covid19swiss$value[sel],INDEX=list(covid19swiss$location[sel],covid19swiss$date[sel]), FUN = function(x){x})))
totcases[1,]<-0
totcases <- apply(totcases,2,function(x){for (i in 1:length(x)){if (is.na(x[i])) {x[i]<-x[i-1]}};return(t(x))})
cases<-as.data.frame(apply(totcases,2,diff))
prepare death data
sel<-covid19swiss$data_type==”deaths_total”
totdeaths<-data.frame(t(tapply(covid19swiss$value[sel],INDEX=list(covid19swiss$location[sel],covid19swiss$date[sel]), FUN = function(x){x})))
totdeaths[1,]<-0
totdeaths <- apply(totdeaths,2,function(x){for (i in 1:length(x)){if (is.na(x[i])) {x[i]<-x[i-1]}};return(t(x))})
deaths<-as.data.frame(apply(totdeaths,2,diff))
1. How did pandemic develop in your canton?
plot raw data
plot(dates,cases$AG, type=”l”)
lines(smooth.spline(dates,y=cases$AG),col=”blue”)
plot(dates,deaths$AG, type=”l”)
lines(smooth.spline(dates,y=deaths$AG),col=”blue”)
plot SQRT transformed data
plot(dates,sqrt(cases$AG),type=”l”)
plot(dates,sqrt(deaths$AG),type=”l”)
put transformed data in matrix
lcases<-sqrt(as.matrix(cases))
colnames(lcases)<-colnames(cases)
2. How did cantons infect each other?
plot relationship between cantons
plot(lcases[,27],lcases[,1],xlab=”number of cases in ZH”,
ylab=”number of cases in AG”)
plot(lcases[,4],lcases[,1],xlab=”number of cases in BE”,
ylab=”number of cases in AG”)
Fit linear regressions
fit1<-lm(lcases[,1]~lcases[,27])
summary(fit1)
plot(lcases[,27],lcases[,1],xlab=”number of cases in ZH”,
ylab=”number of cases in AG”)
abline(coef(fit1),col=”blue”)
fit2<-lm(lcases[,1]~lcases[,4])
summary(fit2)
plot(lcases[,4],lcases[,1],xlab=”number of cases in BE”,
ylab=”number of cases in AG”)
abline(coef(fit2),col=”blue”)
Multiple regression
fit<-lm(lcases[,1]~lcases[,27]+lcases[,4])
summary(fit)
network of Swiss relationships of infections
library(huge)
cases.hg<-huge(lcases,method = “mb”)
cases.shg<-huge.select(cases.hg,criterion = “stars”)
plot(cases.shg)
plot infection network
library(igraph)
adj<-cases.hg$path[[cases.shg$opt.index]]
g<-graph_from_adjacency_matrix(adj,mode=”undirected”)
g<-set_vertex_attr(g,name=”label”, value=colnames(lcases))
plot(g,vertex.size=25,vertex.color=”white”, edge.width=2, curved=TRUE)
Swiss map with most infectious cantons
library(RSwissMaps)
colour each canton by number of links
dt <- can.template(2016)
dt$values <- rowSums(as.matrix(adj))[match(dt$name,colnames(lcases))]
can.plot(dt$bfs_nr, dt$values, 2016)
3. Did the mortality rate improve during the pandemic?
deal with recording issues
wkcases<-apply(cases,2,function(x){tapply(x,rep(1:33,each=7),sum)})
wkdeaths<-apply(deaths,2,function(x){tapply(x,rep(1:33,each=7),sum)})
plot(wkcases[,1],xlab=”Week”,ylab=”number Covid19 cases”,type=”l”)
plot(wkdeaths[,1],xlab=”Week”,ylab=”number Covid19 cases”,type=”l”)
raw mortality probability
plot(wkdeaths[,1]/wkcases[,1],type=”b”,ylab=”mortality rate”,xlab=”week”)
title(“No delay”)
plot(wkdeaths[-(1:2),1]/wkcases[-(32:33),1],type=”b”,ylab=”mortality rate”,xlab=”week”)
title(“Two week delay”)
logistic regression of mortality ratio
wks<-rep(1:31,27) y<-cbind(as.vector(wkdeaths[-c(1,2),]),as.vector(wkcases[-c(32,33),]-wkdeaths[-c(1,2),])) sel<-(apply(y,1,sum)>0)&(y[,2]>=0)
mortality.glm<-glm(y~wks,family=binomial(link = “logit”),subset = sel)
summary(mortality.glm)