Predicting the number of Covid19 infections by using an LSTM

Image credit: www.pixabay.com

Data is extracted from data.public.lu.

Data import

# Data import
covid_data = read.csv2("https://data.public.lu/fr/datasets/r/767f8091-0591-4b04-9a6f-a9d60cd57159",
                       sep=",",na.strings="-", fileEncoding = "Latin1")

Data preparation

# Replace NA values by 0
covid_data[is.na(covid_data)] = 0
# Save data.frame as tibble
covid = as_tibble(covid_data)

# Change variables names
covid <- covid %>% 
  rename(date = Date,
         hospital = Soins.normaux,
         int_care = Soins.intensifs,
         deaths = X.1.NbMorts.,
         left_hospital = Sorties.hôpital,
         inf_cum = Nb.de.positifs.cumulé,
         infections = Nb.de.positifs,tests = Nb.de.tests.effectués,
         tests_cum = Nb.de.tests.effectués.cumulés) %>%
  select(-Soins.intensifs.1) %>%
  mutate(date = as.Date(1:length(date), origin="2020-02-23"))

LSTM

Data preparation

Y = covid$infections
N=length(Y)
step = 7
Y = c(Y, replicate(step, head(Y, 1)))

x = NULL
y = NULL
for(i in 1:N)
{
  s = i-1+step
  x = rbind(x,Y[i:s])
  y = rbind(y,Y[s+1])
}
  
X = array(x, dim=c(N, step,1))

Model

model = keras_model_sequential() %>%   
   layer_lstm(units=64, input_shape=c(step, 1), activation="relu", return_sequences = TRUE,
              kernel_regularizer = regularizer_l2()) %>% 
   layer_dropout(0.05) %>%
   layer_lstm(units=32, activation = "relu") %>%
   layer_dropout(0.05) %>%
   layer_dense(units=16) %>%  
   layer_dense(units=1, activation = "relu")
 
model %>% compile(loss = 'mse',
                  optimizer = 'adam',
                  metrics = list("mean_absolute_error")
                   )
 
model %>% summary()
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## lstm_1 (LSTM)                       (None, 7, 64)                   16896       
## ________________________________________________________________________________
## dropout_1 (Dropout)                 (None, 7, 64)                   0           
## ________________________________________________________________________________
## lstm (LSTM)                         (None, 32)                      12416       
## ________________________________________________________________________________
## dropout (Dropout)                   (None, 32)                      0           
## ________________________________________________________________________________
## dense_1 (Dense)                     (None, 16)                      528         
## ________________________________________________________________________________
## dense (Dense)                       (None, 1)                       17          
## ================================================================================
## Total params: 29,857
## Trainable params: 29,857
## Non-trainable params: 0
## ________________________________________________________________________________

Model training

history <- model %>% fit(X,y, epochs=100, batch_size=32, shuffle = FALSE, validation_split=0.5, verbose=FALSE)
plot(history)
## `geom_smooth()` using formula 'y ~ x'

Model evaluation

y_pred = model %>% predict(X)
 
scores = model %>% evaluate(X, y, verbose = 0)
print(scores)
##                loss mean_absolute_error 
##         12349.11816            56.26086
x_axes = seq(1:length(y_pred))
plot(x_axes, movavg(y,7,"s"), type="l", col="red", lwd=2)
lines(x_axes, movavg(y_pred,7,"s"), col="blue",lwd=2)
legend("topleft", legend=c("Original data", "Predicted data"),
        col=c("red", "blue"), lty=1,cex=0.8)

Prediction

prediction_days = 90 # number of future days to predict
prev_days = Y[(length(covid$infections)-step + 1) : length(covid$infections)]
predictions = c() # predictions for predicitions_days next days
for (day in 1:prediction_days){
  prediction = model %>% predict(array(prev_days, dim=c(1, step, 1)))
  predictions = c(predictions, prediction)
  prev_days = c(prev_days, prediction)[2:(step+1)]
}

# Plot
ggplot(data = data.frame(Date = c(covid$date,max(covid$date)+seq(1,prediction_days)),
                         Infections = movavg(c(covid$infections,predictions),7,"s"),
                         col = c(rep("Historical data",length(covid$date)),rep("Predictions",prediction_days)))) +
  geom_line(mapping=aes(x=Date,y=Infections, colour=factor(col))) +
  theme(legend.title=element_blank()) +
  ggtitle("7-day moving average of daily Covid19 infections in Luxembourg")

As the current model is only based on historical infection data, I will shortly try to update the model by adding supplementary variables such as meteorological data, presence of sanitary measures etc.

Fred Philippy
Fred Philippy
Master’s degree student in Statistics

I am a 2nd-year Master’s degree student in Statistics with a particular interest in Machine Learning, Computational Statistics, Data Analysis and High-Dimensional Statistics.

Related