Text Analysis: Written Questions and Answers in the UK Parliament

Nota Bene: This is only a preliminary analysis. As a next step, I’ll be matching MPs with their parliamentary constituencies and coupling their questions with survey data.

In this preliminary analysis, I will focus on the House of Commons and I will attempt to see if there are any differences among parties in terms of parliamentary questions. I will also check if the answering body’s response is affected by MPs’ party affiliation.
In 2019, 37,396 questions were tabled by members of the House of Commons. Each page of the search results contains 20 questions and, luckily, all pages follow a specific URL pattern, which makes it much easier to automate the scraping process. I will scrape the first 400 pages, which yields 8,000 questions asked between roughly July and December 2019.
I created a function for each variable and a for loop that runs them on each question of every page and stores the data in one data frame.
Let’s scrape:

library("tidyverse")
library("rvest")
# Search-results URL for written questions tabled in 2019. The House=Commons
# filter keeps this first page consistent with the paginated requests issued
# later in the script, which all restrict the search to the Commons.
link <- "https://questions-statements.parliament.uk/written-questions?Answered=Any&AnsweredFrom=&AnsweredTo=&DateFrom=01%2F01%2F2019&DateTo=31%2F12%2F2019&Expanded=True&House=Commons"
page <- read_html(link)
# Each ".overlay-link" node carries a relative href; prefix the host to get
# the absolute URL of every individual question page on this results page.
minilinks <- page %>%
  html_nodes(".overlay-link") %>%
  html_attr("href") %>%
  paste0("https://questions-statements.parliament.uk", .)


# Fetch one question page and return the text of the node matching
# ".reading-width p" (stored by this script as `qdate`).
#
# minilink: absolute URL of a single question page.
# Returns the matched text, or NA_character_ when nothing matches, so that a
# page lacking this node still contributes exactly one value per URL.
get_qdate <- function(minilink) {
  questionpage <- read_html(minilink)
  questdate <- questionpage %>%
    html_nodes(".reading-width p") %>%
    html_text()
  if (length(questdate) == 0) {
    return(NA_character_)
  }
  questdate
}
# vapply() enforces one string per URL; sapply() would silently fall back to a
# list (and break the later data.frame() call) if any page matched differently.
qdate <- vapply(minilinks, FUN = get_qdate, FUN.VALUE = character(1))

# Fetch one question page and return the text of the ".primary-info" node in
# the first row of ".primary-content" (stored by this script as `mpname`).
#
# minilink: absolute URL of a single question page.
# Returns the matched text, or NA_character_ when the node is absent, so the
# result always lines up one-to-one with the input URLs.
get_mp <- function(minilink) {
  questionpage <- read_html(minilink)
  mp <- questionpage %>%
    html_nodes(".primary-content .row:nth-child(1) .primary-info") %>%
    html_text()
  if (length(mp) == 0) {
    return(NA_character_)
  }
  mp
}
# vapply() guarantees a character vector with one entry per URL, failing
# immediately (rather than at data.frame() time) if a page matches several nodes.
mpname <- vapply(minilinks, FUN = get_mp, FUN.VALUE = character(1))

# Fetch one question page and return the text of the ".tertiary-info" node in
# the first row of ".primary-content" (stored by this script as `mpparty`).
#
# minilink: absolute URL of a single question page.
# Returns the matched text, or NA_character_ when the node is absent.
get_party <- function(minilink) {
  questionpage <- read_html(minilink)
  party <- questionpage %>%
    html_nodes(".primary-content .row:nth-child(1) .tertiary-info") %>%
    html_text()
  if (length(party) == 0) {
    return(NA_character_)
  }
  party
}
# One string per URL is required to build the data frame; vapply() checks
# that, whereas sapply() would quietly return a list on ragged results.
mpparty <- vapply(minilinks, FUN = get_party, FUN.VALUE = character(1))

# Fetch one question page and return the text of the
# ".secondary-content .card-inner .primary-info" node (stored by this script
# as `rdept`).
#
# minilink: absolute URL of a single question page.
# Returns the matched text, or NA_character_ when the node is absent.
get_dept <- function(minilink) {
  questionpage <- read_html(minilink)
  dept <- questionpage %>%
    html_nodes(".secondary-content .card-inner .primary-info") %>%
    html_text()
  if (length(dept) == 0) {
    return(NA_character_)
  }
  dept
}
# vapply() keeps the result a plain character vector aligned with the URLs;
# sapply() changes its return type when a page yields zero or several matches.
rdept <- vapply(minilinks, FUN = get_dept, FUN.VALUE = character(1))

# Fetch one question page and return the text of the ".collapse-body" node
# (stored by this script as `quest`).
#
# minilink: absolute URL of a single question page.
# Returns the matched text, or NA_character_ when the node is absent.
get_qst <- function(minilink) {
  questionpage <- read_html(minilink)
  qst <- questionpage %>%
    html_nodes(".collapse-body") %>%
    html_text()
  if (length(qst) == 0) {
    return(NA_character_)
  }
  qst
}
# Type-stable replacement for sapply(): exactly one string per question URL,
# or an immediate error if a page matches more than one node.
quest <- vapply(minilinks, FUN = get_qst, FUN.VALUE = character(1))

# Download a question page and return one string built from every node that
# matches "p+ p": the node texts are extracted and joined with single spaces
# (stored by this script as `respon`).
#
# minilink: absolute URL of a single question page.
get_rsp <- function(minilink) {
  matched_nodes <- html_nodes(read_html(minilink), "p+ p")
  joined_text <- paste(html_text(matched_nodes), collapse = " ")
  # Returned invisibly, as a trailing assignment would be.
  invisible(joined_text)
}
respon <- sapply(minilinks, FUN = get_rsp)

# Assemble the six per-question vectors scraped from the first results page
# into one data frame; text columns stay character rather than factor.
questions <- data.frame(
  mpname = mpname,
  mpparty = mpparty,
  rdept = rdept,
  qdate = qdate,
  quest = quest,
  respon = respon,
  stringsAsFactors = FALSE
)

# Scrape result pages 2-400 and add their questions to `questions`.
#
# Per-page data frames are stored in a preallocated list and bound together
# once at the end, instead of growing `questions` with rbind() on every
# iteration (which re-copies all previously accumulated rows each time).
#
# NOTE(review): every get_*() helper calls read_html() on the same URL, so
# each question page is downloaded six times per pass.
page_numbers <- 2:400
page_frames <- vector("list", length(page_numbers) + 1L)
page_frames[[1]] <- questions # rows already scraped from the first page

for (i in seq_along(page_numbers)) {
  page_result <- page_numbers[i]
  link <- paste0(
    "https://questions-statements.parliament.uk/written-questions?Answered=Any&AnsweredFrom=&AnsweredTo=&DateFrom=01%2F01%2F2019&DateTo=31%2F12%2F2019&Expanded=True&House=Commons&SearchTerm=&Page=",
    page_result
  )
  page <- read_html(link)
  # Absolute URLs of the individual question pages listed on this results page.
  minilinks <- page %>%
    html_nodes(".overlay-link") %>%
    html_attr("href") %>%
    paste0("https://questions-statements.parliament.uk", .)

  qdate <- sapply(minilinks, FUN = get_qdate)
  mpname <- sapply(minilinks, FUN = get_mp)
  mpparty <- sapply(minilinks, FUN = get_party)
  rdept <- sapply(minilinks, FUN = get_dept)
  quest <- sapply(minilinks, FUN = get_qst)
  respon <- sapply(minilinks, FUN = get_rsp)
  temp <- data.frame(mpname, mpparty, rdept, qdate, quest, respon, stringsAsFactors = FALSE)
  page_frames[[i + 1L]] <- temp
}

# Single bind of the first page plus all subsequent pages, in page order.
questions <- do.call(rbind, page_frames)

(SpongeBob French narrator: “Three hours later”)
Now, I’m going to clean and organize the data. Apart from Tidytext’s stop_words, I need to create a custom stop words df. It will contain generic terms that are repeated across the questions such as the name of the answering body (I have that in a separate variable). I won’t be including the script here because it’s very, very long. Next, I will be plotting the number of questions per party.

# Give every question a sequential id so that tokens derived from it later can
# be counted per question.
questions <- questions %>% mutate(id = row_number())

# Eight fill colours for the per-party bars.
# NOTE(review): scale_fill_manual() needs at least one colour per distinct
# value of mpparty; confirm there are no more than eight parties in the data.
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

# Number and share of questions per party. count() + mutate() build the `n`
# and `pct` columns the plot uses with dplyr alone, so this step does not
# depend on a helper from a package this file never attaches.
pid <- questions %>%
  count(mpparty) %>%
  mutate(pct = n / sum(n))

# Horizontal bar chart of each party's share of all questions, ordered by share.
pid %>%
  ggplot(aes(x = reorder(mpparty, pct), y = pct, fill = mpparty)) +
  geom_col(color = "black") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Written Questions Per Party", x = "Party", y = "Percentage", caption = "@Jihed Ncib") +
  # Namespace-qualified because library("tidyverse") does not attach scales.
  scale_y_continuous(labels = scales::percent) +
  theme(legend.position = "none") +
  theme(legend.title = element_blank()) +
  theme(text = element_text(size = 18, family = "saira")) +
  scale_fill_manual(values = cbPalette)

I thought it would be interesting to see if responses are affected by party affiliation (for example, whether Conservative MPs receive longer responses). However, the figure below doesn’t show any major differences, with the exception of a few outliers among Conservative MPs who received longer responses.
(Stop words were included for this one)

# Tokenise each response into one row per word, keeping only questions asked
# by Labour or Conservative MPs.
rtkns <- questions %>%
  unnest_tokens(word, respon) %>%
  filter(mpparty == "Labour" | mpparty == "Conservative")

# Response length = number of tokens per question id. `pid` places the two
# parties at x = 1 (Labour) and x = 2 (Conservative) on a continuous axis.
reponse <- rtkns %>%
  count(id, mpparty) %>%
  mutate(pid = ifelse(mpparty == "Labour", 1, 2))

reponse %>%
  ggplot(aes(x = pid, y = n)) +
  # height = 0: jitter horizontally only, so each point keeps its true token
  # count on the y axis (the default also perturbs y).
  geom_jitter(alpha = 0.20, height = 0) +
  scale_x_continuous(breaks = c(1, 2), labels = c("1" = "Labour", "2" = "Conservative")) +
  theme_minimal() +
  labs(title = "Response Length Per Party", subtitle = "Labour and Conservative", x = "", y = "Length (Terms)", caption = "@Jihed Ncib") +
  theme(text = element_text(size = 16, family = "saira"))
# One row per word of each question, with the tidytext stop words and the
# entries of the `custom` table removed.
questtkns <- questions %>%
  unnest_tokens(word, quest) %>%
  anti_join(stop_words) %>%
  anti_join(custom)

# Word counts within each party, most frequent first (result stays grouped by
# mpparty).
qtkns <- questtkns %>%
  group_by(mpparty) %>%
  count(word) %>%
  arrange(desc(n))
library(patchwork)

# Horizontal bar chart of the 20 most frequent question terms for one party.
#
# term_counts: per-party word counts with columns mpparty, word and n.
# party_name:  value of mpparty to keep; also used as the plot subtitle.
# bar_fill:    fill colour of the bars.
plot_party_terms <- function(term_counts, party_name, bar_fill) {
  term_counts %>%
    filter(mpparty == party_name) %>%
    top_n(20, n) %>%
    ggplot(aes(x = reorder(word, n), y = n)) +
    geom_col(color = "black", fill = bar_fill) +
    coord_flip() +
    theme_minimal() +
    labs(
      title = "Written Questions - Term Frequency",
      subtitle = party_name,
      x = "Term",
      y = "Frequency"
    ) +
    theme(text = element_text(size = 16, family = "saira"))
}

aa1 <- plot_party_terms(qtkns, "Labour", "#009E73")
aa2 <- plot_party_terms(qtkns, "Conservative", "#D55E00")

# patchwork's `+` places the two plots side by side.
aa1 + aa2
# Two fill colours, assigned to the two parties in the order of the fill scale.
cbPalette1 <- c("#56B4E9", "#009E73")

# Number of questions each of the two main parties addressed to each department.
dept <- questions %>%
  filter(mpparty == "Labour" | mpparty == "Conservative") %>%
  group_by(mpparty) %>%
  count(rdept)

# Stacked horizontal bars, one per department. Each department has up to two
# rows (one per party), so departments are ordered by the SUM of their counts,
# which is the stacked bar's length. reorder()'s default summary is the mean,
# which would misplace departments that only one party wrote to.
dept %>%
  ggplot(aes(x = reorder(rdept, n, FUN = sum), y = n, fill = mpparty)) +
  geom_col(color = "black") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Written Questions Per Addressee", subtitle = "Labour and Conservative", x = "Department", y = "Number of Questions", caption = "@Jihed Ncib") +
  theme(legend.title = element_blank()) +
  theme(legend.position = "bottom") +
  theme(text = element_text(size = 18, family = "saira")) +
  scale_fill_manual(values = cbPalette1)