What is Data Science?

Prof. Ernst C. Wit, Institute of Computing, Università della Svizzera italiana (wite@usi.ch)

In this lesson we will learn about Data Science, what it is, how it works and what can go wrong. We will be doing an analysis on Covid19 data in Switzerland.

Note that the code below can be used but the quote signs ” may cause trouble when copy-and-pasting (instead use the Analysis Code files above)

library(covid19swiss)
dates<-unique(covid19swiss$date)[-1]

prepare case data

sel<-covid19swiss$data_type==”cases_total”
totcases<-data.frame(t(tapply(covid19swiss$value[sel],INDEX=list(covid19swiss$location[sel],covid19swiss$date[sel]), FUN = function(x){x})))
totcases[1,]<-0
totcases <- apply(totcases,2,function(x){for (i in 1:length(x)){if (is.na(x[i])) {x[i]<-x[i-1]}};return(t(x))})
cases<-as.data.frame(apply(totcases,2,diff))

prepare death data

sel<-covid19swiss$data_type==”deaths_total”
totdeaths<-data.frame(t(tapply(covid19swiss$value[sel],INDEX=list(covid19swiss$location[sel],covid19swiss$date[sel]), FUN = function(x){x})))
totdeaths[1,]<-0
totdeaths <- apply(totdeaths,2,function(x){for (i in 1:length(x)){if (is.na(x[i])) {x[i]<-x[i-1]}};return(t(x))})
deaths<-as.data.frame(apply(totdeaths,2,diff))

1. How did pandemic develop in your canton?

plot raw data

plot(dates,cases$AG, type=”l”)
lines(smooth.spline(dates,y=cases$AG),col=”blue”)

plot(dates,deaths$AG, type=”l”)
lines(smooth.spline(dates,y=deaths$AG),col=”blue”)

plot SQRT transformed data

plot(dates,sqrt(cases$AG),type=”l”)
plot(dates,sqrt(deaths$AG),type=”l”)

put transformed data in matrix

lcases<-sqrt(as.matrix(cases))
colnames(lcases)<-colnames(cases)

2. How did cantons infect each other?

plot relationship between cantons

plot(lcases[,27],lcases[,1],xlab=”number of cases in ZH”,
ylab=”number of cases in AG”)

plot(lcases[,4],lcases[,1],xlab=”number of cases in BE”,
ylab=”number of cases in AG”)

Fit linear regressions

fit1<-lm(lcases[,1]~lcases[,27])
summary(fit1)
plot(lcases[,27],lcases[,1],xlab=”number of cases in ZH”,
ylab=”number of cases in AG”)
abline(coef(fit1),col=”blue”)

fit2<-lm(lcases[,1]~lcases[,4])
summary(fit2)
plot(lcases[,4],lcases[,1],xlab=”number of cases in BE”,
ylab=”number of cases in AG”)
abline(coef(fit2),col=”blue”)

Multiple regression

fit<-lm(lcases[,1]~lcases[,27]+lcases[,4])
summary(fit)

network of Swiss relationships of infections

library(huge)
cases.hg<-huge(lcases,method = “mb”)
cases.shg<-huge.select(cases.hg,criterion = “stars”)
plot(cases.shg)

plot infection network

library(igraph)
adj<-cases.hg$path[[cases.shg$opt.index]]
g<-graph_from_adjacency_matrix(adj,mode=”undirected”)
g<-set_vertex_attr(g,name=”label”, value=colnames(lcases))
plot(g,vertex.size=25,vertex.color=”white”, edge.width=2, curved=TRUE)

Swiss map with most infectious cantons

library(RSwissMaps)

colour each canton by number of links

dt <- can.template(2016)
dt$values <- rowSums(as.matrix(adj))[match(dt$name,colnames(lcases))]
can.plot(dt$bfs_nr, dt$values, 2016)

3. Did the mortality rate improve during the pandemic?

deal with recording issues

wkcases<-apply(cases,2,function(x){tapply(x,rep(1:33,each=7),sum)})
wkdeaths<-apply(deaths,2,function(x){tapply(x,rep(1:33,each=7),sum)})

plot(wkcases[,1],xlab=”Week”,ylab=”number Covid19 cases”,type=”l”)
plot(wkdeaths[,1],xlab=”Week”,ylab=”number Covid19 cases”,type=”l”)

raw mortality probability

plot(wkdeaths[,1]/wkcases[,1],type=”b”,ylab=”mortality rate”,xlab=”week”)
title(“No delay”)
plot(wkdeaths[-(1:2),1]/wkcases[-(32:33),1],type=”b”,ylab=”mortality rate”,xlab=”week”)
title(“Two week delay”)

logistic regression of mortality ratio

wks<-rep(1:31,27) y<-cbind(as.vector(wkdeaths[-c(1,2),]),as.vector(wkcases[-c(32,33),]-wkdeaths[-c(1,2),])) sel<-(apply(y,1,sum)>0)&(y[,2]>=0)
mortality.glm<-glm(y~wks,family=binomial(link = “logit”),subset = sel)
summary(mortality.glm)