NLP with ChatGPT

Demonstration using PAC data
large language models (LLM)
evidence synthesis
methods
notes
Author

Luke Heley

Published

January 27, 2024

Code
# Load Lines and Split YAML and MD
#
# The file is expected to open with a YAML front-matter block fenced by two
# `---` lines; everything after the closing fence is the markdown body.
lines <- readLines("pac.md")

# Only a line consisting solely of `---` counts as a fence, so a value or a
# table rule that merely contains `---` cannot be mistaken for one.
yaml_index <- which(stringr::str_detect(lines, "^---\\s*$"))[1:2]
if (anyNA(yaml_index)) {
  stop("pac.md must contain an opening and a closing `---` line.", call. = FALSE)
}

yaml <- lines[yaml_index[1]:yaml_index[2]]
# Dropping the first `yaml_index[2]` lines yields an empty body (rather than a
# backwards index sequence) when the front matter is the last thing in the file.
md <- tail(lines, -yaml_index[2])
Code
# Extract Questions and Individual
#
# The markdown body is re-joined and split on blank lines, giving one row per
# paragraph. For each paragraph:
#   * `question` is a leading "Q<1-999> " label, if the paragraph starts with one;
#   * `person` is whatever precedes the first ": " (it may still include the
#     question label at this point);
#   * the "<person>: " prefix is stripped from `text`.
# Paragraphs without a label/speaker inherit both from the paragraph above.
df <- dplyr::tibble(text = md) |>
  dplyr::summarise(text = paste(text, collapse = "\n")) |>
  dplyr::mutate(text = stringr::str_split(text, "\n\n")) |>
  tidyr::unnest(text) |>
  # Hard-coded window for this particular pac.md -- presumably it trims
  # non-transcript paragraphs at either end; re-check if the file changes.
  dplyr::slice(8:457) |>
  dplyr::mutate(paragraph_id = dplyr::row_number()) |>
  # Anchored for every label width: a three-digit "Qnnn " mentioned in the
  # middle of a paragraph must not be read as that paragraph's question label.
  dplyr::mutate(question = stringr::str_extract(text, "^Q[1-9][0-9]{0,2} ")) |>
  dplyr::mutate(person = stringr::str_locate(text, ": ")[, "start"]) |>
  dplyr::mutate(person = substr(text, 1, person - 1)) |>
  # The prefix is matched literally so names containing regex metacharacters
  # such as "(" or "." are removed correctly.
  dplyr::mutate(
    text = ifelse(
      !is.na(person),
      stringr::str_remove(text, stringr::fixed(paste0(person, ": "))),
      text
    )
  ) |>
  tidyr::fill(question, person) |>
  # Drop the question label from the speaker, then tidy both columns.
  dplyr::mutate(person = trimws(stringr::str_remove(person, question))) |>
  dplyr::mutate(question = trimws(question))

# Sanity check: no paragraph should be left empty after stripping prefixes.
df |>
  dplyr::filter(text == "")
# A tibble: 0 × 4
# ℹ 4 variables: text <chr>, paragraph_id <int>, question <chr>, person <chr>
Code
# Setup Open AI
import os
from openai import OpenAI
from tqdm import tqdm

# The key is taken from the OPENAI_API_KEY environment variable. This is the
# client's default lookup, so passing it explicitly is optional.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Number of paragraphs in the R data frame `df`, reached through the `r` object.
n = len(r.df["text"])
Code
# Extract the keywords
# One chat request per paragraph; the reply text (comma-separated topic labels,
# as requested by the prompt) is stored in paragraph order.
keywords = []
for i in tqdm(range(n)):
  para = r.df["text"][i]
  prompt = f"Label the topics being discussed in the following paragraph using words or a short phrase, separate the topic labels with a comma:\n {para}\n"
  reply = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt}],
  )
  keywords.append(reply.choices[0].message.content)
Code
# Rate the sentiment
# One chat request per paragraph; the raw reply text is stored in paragraph
# order and is parsed into a number later on the R side.
# NOTE(review): the prompt starts with a stray ":" -- kept as-is here because
# the saved replies were generated with exactly this wording.
sentiment = []
for i in tqdm(range(n)):
  para = r.df["text"][i]
  prompt = f":Rate the sentiment expressed in the following paragraph on a scale of 1 to 10 where 1 is very negative and 10 is very positive:\n {para}\n"
  reply = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": prompt}],
  )
  sentiment.append(reply.choices[0].message.content)
Code
# Attach the model replies to the paragraph table and save it as CSV
# (without the pandas index column).
df = r.df.assign(keywords=keywords, sentiment=sentiment)
df.to_csv("out.csv", index=False)
Code
library(ggplot2)
# Reload the paragraph table together with the saved model replies.
df <- readr::read_csv(file = "out.csv")
Rows: 450 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): text, question, person, keywords, sentiment
dbl (1): paragraph_id

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
# Replies in which the model declined to give a rating. They are matched
# verbatim below and scored as the neutral value 5.
na <- c(
  "I'm sorry, but without any specific sentiment mentioned in the paragraph, it is not possible to rate it on a scale of 1 to 10. The paragraph simply asks for clarification regarding the timeframe of something, and does not express any positive or negative sentiment.",
  "As an AI language model, I do not possess personal opinions or emotions. Therefore, I am unable to rate the sentiment expressed in the given statement.",
  "Without any additional context, it is difficult to accurately rate the sentiment expressed in the given statement.",
  "Without any context provided, it is impossible to determine the sentiment expressed in the given phrase.",
  "Since the given statement is an incomplete sentence, it is difficult to determine the exact sentiment being expressed. Therefore, it is not possible to rate the sentiment on a scale of 1 to 10.",
  "It is difficult to accurately rate the sentiment expressed in this paragraph since it is incomplete and lacks emotional context. Without further information, it is not possible to determine whether the sentiment is positive or negative.",
  "I'm sorry, but the given sentence \"It is 2032.\" does not convey any sentiment, as it is a neutral statement indicating a specific year. Therefore, it would not be accurate to rate it on a scale of 1 to 10.",
  "Without the complete sentence, it is difficult to accurately rate the sentiment expressed in the paragraph. Please provide the full sentence for a more accurate assessment.",
  "I am sorry, but the given phrase does not convey any specific sentiment. It appears to be incomplete and lacks context.",
  "There is no clear sentiment expressed in the sentence \"Yes.\" Therefore, it is not possible to rate its sentiment on a scale from 1 to 10."
)

df <- df |>
  # Take the first number in the reply as the score. "10" has to be tried
  # before the single-digit alternative, otherwise a rating of 10 is read as 1;
  # the dot is escaped so only a literal ".5" counts as a half point.
  dplyr::mutate(
    sentiment_score = as.numeric(
      stringr::str_extract(sentiment, "10|[1-9]\\.5|[1-9]")
    )
  ) |>
  # Refusals may still contain digits (e.g. "1 to 10"), so override them.
  dplyr::mutate(sentiment_score = ifelse(sentiment %in% na, 5, sentiment_score)) |>
  # "Q12" -> 12 so questions sort and plot numerically.
  dplyr::mutate(question = as.numeric(stringr::str_remove(question, "Q")))

# Sentiment score across the transcript, in paragraph order.
ggplot(df) + geom_line(aes(paragraph_id, sentiment_score))

Code
# Spread of sentiment scores per speaker, drawn as horizontal boxplots.
ggplot(df, aes(x = person, y = sentiment_score)) +
  geom_boxplot() +
  coord_flip()
Warning: Removed 4 rows containing non-finite values (`stat_boxplot()`).

Code
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
Code
# Build a hover-friendly copy of each paragraph: the text is cut at the
# whitespace character closest to every 100th character and each piece is
# terminated with an HTML "<br>".
df$split_text <- purrr::map_chr(df$text, \(txt) {
  # Target line width in characters.
  n <- 100

  # Positions of all whitespace characters in the paragraph.
  spaces <- stringr::str_locate_all(txt, "\\s")[[1]][, "start"]

  if (length(spaces) == 0L || max(spaces) < n) {
    # No whitespace at all (checked first so max() never sees an empty vector
    # and warns), or none at/after position n: keep the text as one piece.
    from <- 1
    to <- nchar(txt)
  } else {
    # For every multiple of n, pick the nearest whitespace position
    # (the earlier one on ties).
    targets <- seq(n, nchar(txt), by = n)
    splits <- purrr::map_int(targets, \(target) {
      spaces[which.min(abs(spaces - target))]
    })
    # Consecutive pieces share their boundary whitespace character.
    bounds <- c(1, splits, nchar(txt))
    from <- head(bounds, -1)
    to <- tail(bounds, -1)
  }

  paste0(substring(txt, from, to), "<br>", collapse = "")
})
Warning in max(spaces): no non-missing arguments to max; returning -Inf

Warning in max(spaces): no non-missing arguments to max; returning -Inf

Warning in max(spaces): no non-missing arguments to max; returning -Inf

Warning in max(spaces): no non-missing arguments to max; returning -Inf

Warning in max(spaces): no non-missing arguments to max; returning -Inf
Code
# Interactive heatmap of sentiment score by question (x) and speaker (y).
# Speakers are ordered by reversed first appearance; hovering a cell shows the
# line-broken paragraph text only.
plot_ly(
  data = df,
  type = "heatmap",
  x = ~question,
  y = ~factor(person, rev(unique(person))),
  z = ~sentiment_score,
  text = ~split_text,
  hovertemplate = "%{text}<extra></extra>"
) |>
  layout(
    xaxis = list(title = "Question"),
    yaxis = list(title = "Person")
  )
Code
# One row per question: every paragraph belonging to the question is rendered
# as "<person>: <text>" and the lines are joined with newlines.
question_text2 <- df |>
  dplyr::group_by(question) |>
  dplyr::summarise(text = paste0(person, ": ", text, collapse = "\n"))
Code
# Extract the topics
# One chat request per question: the model is asked for a single word or very
# short phrase naming the topic of that question's combined transcript text.
topics = []
# Read the length and the texts from the same R object (`question_text2`);
# the loop previously read from `question_text`, which this document never
# defines. The column is fetched once instead of on every iteration.
question_texts = r.question_text2["text"]
n = len(question_texts)
for i in tqdm(range(0, n)):
  para = question_texts[i]
  chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
             "content": f"Label the topic being discussed in the given context using only a single word or very short phrase. Context:\n {para}\n",
        }
    ],
    model="gpt-3.5-turbo",
    )
  topics.append(chat_completion.choices[0].message.content)
Code
library(reticulate)

# Per-question summary: the combined "<person>: <text>" transcript, the mean
# sentiment score (missing scores ignored) and the topic label produced by the
# Python step, taken in question order.
q_sent_topic <- df |>
  dplyr::group_by(question) |>
  dplyr::summarise(
    text = paste0(person, ": ", text, collapse = "\n"),
    avg_sentiment = mean(sentiment_score, na.rm = TRUE)
  ) |>
  dplyr::mutate(topic = py$topics)

# Save the summary so later chunks can reload it from disk.
write.csv(q_sent_topic, file = "q_sent_topic.csv", row.names = FALSE)
Code
# Reload the saved per-question summary.
q_sent_topic <- readr::read_csv(file = "q_sent_topic.csv")
Rows: 138 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): text, topic
dbl (2): question, avg_sentiment

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
# Searchable table of questions. The topic is shown as "Q<n> <topic>" and made
# a factor in row order, so the column filter lists topics in question order.
q_sent_topic |>
  dplyr::mutate(
    topic = paste0("Q", question, " ", topic),
    topic = factor(topic, unique(topic))
  ) |>
  dplyr::select(topic, question, text, avg_sentiment) |>
  DT::datatable(rownames = FALSE, filter = "top")
Code
# Plot data: label each question as "<question> <topic>", re-centre the mean
# sentiment on the neutral score 5, and flag whether it ends up above zero.
pdf <- q_sent_topic |>
  dplyr::mutate(
    id = paste(question, topic),
    avg_sentiment = avg_sentiment - 5,
    positive = avg_sentiment > 0
  )

# Centred sentiment for rows 1-50, first row at the top of the flipped axis.
pdf |>
  dplyr::slice(1:50) |>
  ggplot(aes(factor(id, rev(unique(id))), avg_sentiment, fill = positive)) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(limits = c(-4, 4), breaks = -4:4)

Code
# Centred sentiment for rows 51-100, first row at the top of the flipped axis.
pdf |>
  dplyr::slice(51:100) |>
  ggplot(aes(factor(id, rev(unique(id))), avg_sentiment, fill = positive)) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(limits = c(-4, 4), breaks = -4:4)

Code
# Centred sentiment for the remaining rows (positions 101-150 are requested;
# the table read above has 138 rows), first row at the top of the flipped axis.
pdf |>
  dplyr::slice(101:150) |>
  ggplot(aes(factor(id, rev(unique(id))), avg_sentiment, fill = positive)) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(limits = c(-4, 4), breaks = -4:4)