library(tidyverse)
library(data.table)
library(here)
library(kableExtra)
library(tidytext)
library(DT)
data_path <- "C:/Users/goutsmedt/Documents/MEGAsync/Research/R/projets/data/green_ecb_responsiveness"
source(file.path(path.expand("~"), "green_ecb", "function", "functions_for_topic_modelling.R"))
K <- 120 # number of topics of the model analysed below
# load the topics stats and gamma attributes
lda <- readRDS(file.path(data_path, "topic_modelling", paste0("LDA_", K, ".rds")))
lda_data <- readRDS(file.path(data_path,
                              "topic_modelling",
                              paste0("LDA_", K, "_data.rds"))) %>%
as.data.table() %>%
.[, period := case_when(between(date, "1998-11-20", "2011-11-08") ~ "Period_1",
between(date, "2011-11-08", "2021-09-01") ~ "Period_2",
between(date, "2021-09-01", "2023-02-01") ~ "Period_3")]
lda_proximity <- readRDS(file.path(data_path,
                                   "topic_modelling",
                                   "similarities_LDA.rds"))
topics_to_look <- lda_proximity %>%
filter(rank <= 5 | topic %in% c(114, 21, 91, 103),
! topic %in% c(78, 76, 115, 43, 74, 109, 56, 61)) %>%
distinct(topic)
inflation_topics <- lda_data %>%
filter(inflation_topic == TRUE) %>%
distinct(topic)
topics_to_look <- bind_rows(inflation_topics, topics_to_look) %>%
mutate(rank = row_number())
data_year_subset <- lda_data %>%
filter(! is.na(period)) %>%
.[,`:=` (mean = mean(gamma),
st_err = sd(gamma)/sqrt(length(gamma))), by = .(topic, period)] %>%
.[order(period, desc(mean)),] %>%
distinct(topic, topic_name, inflation_topic, period, mean, st_err) %>%
.[, rank := 1:.N, by = period] %>%
pivot_wider(names_from = "period", values_from = c("mean", "st_err", "rank"))
topics_per_speech <- lda_data %>%
.[, gamma_speech := mean(gamma), by = .(topic, file)] %>%
select(topic, file, title, year, date, speaker_cleaned, gamma_speech, pdf_link, period) %>%
unique()
# Calculate top frex and lift value for the topic
beta_lda <- tidy(lda, matrix = "beta") %>%
group_by(topic) %>%
slice_max(order_by = beta, n = 15, with_ties = FALSE) %>%
mutate(rank_beta = 1:n()) %>%
select(topic, term_beta = term, rank_beta, beta)
frex_lda <- calculate_frex(lda, 15, 0.5, topic_method = "LDA") %>%
group_by(topic) %>%
slice_max(order_by = frex, n = 15, with_ties = FALSE) %>%
ungroup() %>%
select(term_frex = term, rank_frex = rank, frex)
lda_words <- beta_lda %>%
  bind_cols(frex_lda) # works because both tables hold exactly 15 rows per topic, in the same topic order
# Most representative speech
top_speech_paragraphs <- lda_data %>%
select(topic, document_id, title, date, speaker_cleaned, period, pdf_link, paragraphs, gamma) %>%
group_by(topic) %>%
slice_max(gamma, n = 10, with_ties = FALSE) %>%
mutate(title_link = paste0("[", title, "](", pdf_link, ")"),
paragraphs = str_trunc(paragraphs, 800, "right") %>% str_squish(),
gamma = round(gamma, 3)) %>%
ungroup()
top_speech <- topics_per_speech %>%
select(topic, file, title, date, speaker_cleaned, period, pdf_link, gamma_speech) %>%
group_by(topic) %>%
slice_max(gamma_speech, n = 15, with_ties = FALSE) %>%
mutate(title_link = paste0("[", title, "](", pdf_link, ")"),
gamma_speech = round(gamma_speech, 3)) %>%
ungroup()
# Most representative speech per period
top_speech_paragraphs_period <- lda_data %>%
select(topic, document_id, title, date, speaker_cleaned, period, pdf_link, paragraphs, gamma) %>%
filter(! is.na(period)) %>%
group_by(period, topic) %>%
slice_max(gamma, n = 5, with_ties = FALSE) %>%
mutate(title_link = paste0("[", title, "](", pdf_link, ")"),
paragraphs = str_trunc(paragraphs, 800, "right") %>% str_squish(),
gamma = round(gamma, 3)) %>%
ungroup()
top_speech_period <- topics_per_speech %>%
select(topic, file, title, date, speaker_cleaned, period, pdf_link, gamma_speech) %>%
filter(! is.na(period)) %>%
group_by(period, topic) %>%
slice_max(gamma_speech, n = 5, with_ties = FALSE) %>%
mutate(title_link = paste0("[", title, "](", pdf_link, ")"),
gamma_speech = round(gamma_speech, 3)) %>%
ungroup()
# ordering topics
list_topics <- data_year_subset %>%
mutate(prevalence = mean_Period_1 + mean_Period_2 + mean_Period_3) %>%
filter(topic %in% topics_to_look$topic) %>%
arrange(desc(inflation_topic), desc(prevalence)) %>%
mutate(topic_name = paste0("Topic ", topic, ": ", topic_name),
rank = row_number())
Our article seeks to understand the transformations in the ECB’s framing of the inflation issue between 1998 and early 2023. To observe these transformations, we use topic modelling on a corpus of speeches by ECB policymakers.
Topic modelling is a method used to uncover hidden themes (the topics) in a large corpus of texts. It is an “unsupervised” method, in the sense that it automatically identifies structures and categories in an unstructured corpus. This technical appendix provides the details of our implementation and use of this method.
Our corpus is composed of the speeches of ECB board members given between 1998-11-20 and 2023-02-01 and listed on the Bank for International Settlements website. We used tesseract (Ooms 2022) for Optical Character Recognition on the speeches for which the initial character recognition was poor. Thanks notably to the R packages tidytext (Silge and Robinson 2016) and tokenizers (Mullen et al. 2018), we divide each speech into a sequence of paragraphs. We remove bibliography paragraphs as well as paragraphs with acknowledgements.
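The paragraph splitting can be sketched as follows (a minimal sketch: the speeches data frame and its text column are illustrative names, not the ones used in our actual scripts).
# Minimal sketch of the paragraph splitting; `speeches` and `text`
# are illustrative names for the raw corpus.
library(tokenizers)
speeches_paragraphs <- speeches %>%
  mutate(paragraphs = tokenize_paragraphs(text)) %>%
  unnest(paragraphs)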
We want to focus on speeches that deal substantially with inflation. We thus decide to keep only the speeches that mention words containing “inflation” (“inflation”, “inflationary”, “disinflation”, etc.) a minimum number of times. To take into account the length of a speech, we divide the frequency of “inflation” words by the number of pages of the speech. We test three thresholds below which a speech is removed from the corpus.
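A minimal sketch of this filtering, assuming hypothetical text and pages columns in the speeches data frame introduced above, could look as follows:
# Keep the speeches mentioning "inflation" words at least `threshold`
# times per page; `text` and `pages` are illustrative column names.
threshold <- 2 # the most restrictive option tested below
speeches_kept <- speeches %>%
  mutate(inflation_words = str_count(text, regex("\\w*inflation\\w*", ignore_case = TRUE)),
         inflation_per_page = inflation_words / pages) %>%
  filter(inflation_per_page >= threshold)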
Here is the number of speeches in the corpus depending on the threshold we use:
knitr::include_graphics(here::here("pictures", "threshold_corpus_absolute.png"))
We see that the filtering removes a lot of speeches between 2009 and 2013, a sign that inflation was a less important issue at the time. But the differences between the thresholds are, on average, not very large. We decide to take the most restrictive threshold (2 times the number of pages), which gives us 817 speeches. This choice gives us a corpus large enough to constitute a representative sample of speeches on inflation, while avoiding the integration of speeches in which inflation is not central.
Texts are tokenized with the tidytext (Silge and Robinson 2016) and tokenizers (Mullen et al. 2018) packages. We keep unigrams (like “price”), bigrams (“price stability”) and trigrams (“maintain price stability”). The corpus is organised in paragraphs: the documents in the topic modelling are the 6818 paragraphs of the 817 speeches. This allows (i) a more fine-grained understanding of what each speech is about, and (ii) a more accurate measurement of the correlations between topics, at the paragraph level.
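The extraction of unigrams, bigrams and trigrams can be sketched as follows (using the illustrative speeches_paragraphs data frame from the sketch above):
# Extract unigrams, bigrams and trigrams from each paragraph with
# tidytext; map_dfr() stacks the three sets of tokens.
tokens <- map_dfr(1:3, function(n) {
  speeches_paragraphs %>%
    unnest_tokens(term, paragraphs, token = "ngrams", n = n)
})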
We remove the words in the “nltk” and “iso” stopword lists implemented in the R package stopwords (Benoit, Muhr, and Watanabe 2021). These are large lists of stopwords, allowing us to remove unnecessary words from our analysis. We lemmatize each word using the dictionary incorporated in textstem (Rinker 2018).
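These two steps can be sketched as follows, still based on the illustrative tokens data frame (lemmatize_strings() is used here so that bigrams and trigrams are lemmatized as well):
# Remove stopwords and lemmatize; "stopwords-iso" is the source name
# of the "iso" list in the stopwords package. This simplified sketch
# only drops tokens that exactly match a stopword.
library(stopwords)
library(textstem)
stop_words_list <- unique(c(stopwords("en", source = "nltk"),
                            stopwords("en", source = "stopwords-iso")))
tokens_clean <- tokens %>%
  filter(! term %in% stop_words_list) %>%
  mutate(term = lemmatize_strings(term))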
To run our topic model, we use standard Latent Dirichlet Allocation (LDA), a probabilistic generative model employed in machine learning to detect the topics present in a collection of documents. It presupposes that each document comprises a blend of a limited number of concealed topics, with each word in the document generated by one of those topics.
The LDA model characterizes each document as a distribution over topics, wherein each topic is a distribution over words. It uses Bayesian inference to estimate the probability distribution of topics within each document, the distribution of words within each topic, and the overall distribution of topics in the entire collection.
The generative process underlying the model encompasses three steps:
1. For each topic, draw a distribution over the words of the vocabulary.
2. For each document, draw a distribution over the topics.
3. For each word of a document, draw a topic from the document’s topic distribution, then draw the word from the word distribution of this topic.
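For reference, such a model can be estimated with the topicmodels package along the following lines (the document-term matrix dtm and the seed are illustrative; in this appendix, the estimated models are simply loaded from disk):
# Illustrative LDA estimation with Gibbs sampling; `dtm` is a
# hypothetical document-term matrix built from the paragraphs.
library(topicmodels)
lda_example <- LDA(dtm, k = 120, method = "Gibbs", control = list(seed = 1234))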
We use the LDA model as it has proved useful in much research (Boyd-Graber, Hu, and Mimno 2017; Nelson 2020; Macanovic 2022). The structural topic model [STM; Roberts et al. (2013)] has often been used by political scientists in recent years. It makes it possible to “incorporate structure” in the model (Grimmer, Roberts, and Stewart 2022, 244) by integrating additional categories attributed to the documents (e.g., the writer, the audience, the year, etc.) and to observe, notably, how topic prevalence varies depending on these categories. However, the only relevant metadata in our case was the date of the speeches, and LDA conveniently allows us to take into account the variation of topic prevalence over time. We thus felt there was no need to opt for another type of model.
Denny and Spirling (2018) have shown that the choice of pre-processing steps (lemmatization or stemming, choosing stop words to remove, filtering rare words, etc.) and their order may have a large impact on the results of topic modelling. One risk, according to Denny and Spirling (2018), is to choose the results that best match what the authors want to say. We took care of Denny and Spirling’s point in three ways.
First, we adopted specific pre-processing choices and stuck to them (lemmatization, a large list of stop words, keeping bigrams and trigrams as well; see section 3.1). Consequently, we could not manipulate these pre-processing steps to obtain the results we wanted.
Second, we tested different pre-processing options for one crucial choice: the filtering of rare words. We tested three different thresholds, removing all the n-grams that appear fewer than 5, 10 or 20 times. Each threshold results in a different vocabulary, and we run our LDA models on each of these vocabulary lists.
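This filtering can be sketched as follows (again with the illustrative tokens_clean data frame from above):
# Build three vocabularies by removing the n-grams appearing fewer
# than 5, 10 or 20 times in the corpus.
vocabularies <- lapply(c(5, 10, 20), function(min_n) {
  tokens_clean %>%
    add_count(term, name = "n_occurrences") %>%
    filter(n_occurrences >= min_n)
})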
We also test models with different numbers of topics, from 30 to 160, in steps of 10. For each of our three vocabulary filters, we thus run 14 models with different numbers of topics, for a total of 42 models (see the sketch below). We use both quantitative and qualitative approaches to choose our filtering threshold and the number of topics.
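The resulting grid of models can be sketched as follows (dtms being a hypothetical list of the three document-term matrices, one per vocabulary):
# 3 vocabularies x 14 numbers of topics = 42 models.
models <- lapply(dtms, function(dtm) {
  lapply(seq(30, 160, by = 10), function(k) {
    LDA(dtm, k = k, method = "Gibbs", control = list(seed = 1234))
  })
})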
First, we compute four quantitative metrics implemented in the ldatuning package (Nikita 2020) for our different models. Two metrics, inspired by Arun et al. (2010) and Cao et al. (2009), have to be minimized; the two others, inspired by Griffiths and Steyvers (2004) and Deveaud, SanJuan, and Bellot (2014), have to be maximized. Here is an interactive figure to observe the results for the different pre-processing methods (method 1 for the 5-occurrence threshold; method 2 for the 10-occurrence threshold; and method 3 for the 20-occurrence threshold). The crosses indicate the maximized and minimized values. A sketch of the computation of these metrics follows the figure.
htmltools::includeHTML(here::here("writing", "tuning_topicmodels.html"))
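For reference, these metrics can be computed with ldatuning along the following lines (a sketch, assuming the illustrative dtm defined above):
# Compute the four metrics over the grid of numbers of topics and plot them.
library(ldatuning)
tuning_results <- FindTopicsNumber(
  dtm,
  topics = seq(30, 160, by = 10),
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  method = "Gibbs",
  control = list(seed = 1234)
)
FindTopicsNumber_plot(tuning_results)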