# Source: scope_wsc_2025/2_clean_data.R
# Repository: pythonhealthdatascience/scope_wsc_2025 (main branch)
# Analysis of final set

library(tidyverse)
library(stringi)
library(data.table)

# 1. Clean the dataset and the author names, then save the result as file 6
# Read in file 5: the dataset produced after full-text review and data extraction

# Load file 5, then narrow it in two steps:
#   - keep only the rows flagged ACCEPT == 1
#   - drop the `reason` column
fullset <- read.csv("datasets/5_fullset_analysis.csv")
fullset <- fullset %>% filter(ACCEPT == 1)
fullset <- fullset %>% select(-reason)

################################################################################

# Per-year totals of the DES and hybrid columns (missing values are skipped
# when summing); one output row per distinct value of `year`
line_data <- summarise(
  group_by(fullset, year),
  DES_count = sum(DES, na.rm = TRUE),
  hybrid_count = sum(hybrid, na.rm = TRUE)
)


###################################
# Extend authors - exploratory investigation of author relationships

# Build `authors_clean`: a "; "-separated list of surnames for each row.
# Each ";"-separated entry of `authors` is reduced to its surname, taken as
# the leading run of characters before the first comma or whitespace.
fullset$authors_clean <- fullset$authors %>%
  # Replace special characters (e.g., accented letters) with ASCII equivalents
  stri_trans_general("Latin-ASCII") %>%
  # Split by ";" to separate the individual authors of each row
  str_split(";") %>%
  map_chr(function(author_list) {
    # A missing author field stays missing instead of becoming the text "NA"
    if (all(is.na(author_list))) {
      return(NA_character_)
    }
    # Vectorised over all authors of the row: trim, then take the leading
    # token that contains neither a comma nor whitespace
    surnames <- str_extract(str_trim(author_list), "^[^,\\s]+")
    # Entries with no surname (e.g. the empty piece after a trailing ";")
    # yield NA here and are dropped so no dangling "; " is produced
    surnames <- surnames[!is.na(surnames)]
    paste(surnames, collapse = "; ")
  })

# Preview the original and cleaned author strings side by side.
# print() is explicit so the preview also appears when the script is run via
# source(), which does not auto-print top-level values.
print(head(select(fullset, authors, authors_clean)))

# Save as file 6. row.names = FALSE keeps write.csv from adding an unnamed
# row-index column, which would come back as a spurious "X" column when the
# file is read in again.
write.csv(fullset, "datasets/6_fullset_authors.csv", row.names = FALSE)

##################################################################

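# Manual step (performed outside this script): check the cleaned authors list
# in file 6 for errors, correct it by hand, and save the result as file 7
# (datasets/7_fullset_authors.csv)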

##################################################################

# read in file 7

fullset <- read.csv("datasets/7_fullset_authors.csv")

# Count how often each surname occurs in `authors_clean` across all rows and
# keep the surnames that occur more than once.
# The column was edited by hand, so the split accepts ";" with any amount of
# surrounding whitespace rather than only the exact "; " separator, and blank
# or missing entries are removed so they are never counted as an author.
top_authors <- fullset %>%
  mutate(
    authors_clean = str_split(as.character(authors_clean), "\\s*;\\s*")
  ) %>%
  # Expand the list column: one row per (paper, surname) pair
  unnest(authors_clean) %>%
  mutate(authors_clean = str_trim(authors_clean)) %>%
  filter(!is.na(authors_clean), authors_clean != "") %>%
  count(authors_clean) %>%
  filter(n > 1)

###############################################################