-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path2_clean_data.R
More file actions
68 lines (49 loc) · 2.03 KB
/
2_clean_data.R
File metadata and controls
68 lines (49 loc) · 2.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Analysis of final set
library(tidyverse)
library(stringi)
library(data.table)
# 1. Clean dataset, clean authors, save as file 6
# File 5 is the output of the full-text read and data-extraction stage.
# Load it, keep only the rows where ACCEPT equals 1, and drop the `reason`
# column.
fullset <- "datasets/5_fullset_analysis.csv" %>%
  read.csv() %>%
  filter(ACCEPT == 1) %>%
  select(-reason)
################################################################################
# Yearly totals of the DES and hybrid columns.
# Produces one row per year with the columns DES_count and hybrid_count;
# missing values are ignored when summing.
line_data <- fullset %>%
  group_by(year) %>%
  summarise(
    across(
      c(DES, hybrid),
      ~ sum(.x, na.rm = TRUE),
      .names = "{.col}_count"
    )
  )
###################################
# Extend authors - exploratory investigation of author relationships
# Build `authors_clean`: for every record, a single "; "-separated string
# holding the surname of each author listed in `authors`.
fullset$authors_clean <- fullset$authors %>%
  # Replace special characters (e.g., accented letters) with ASCII equivalents
  stri_trans_general("Latin-ASCII") %>%
  # Split each record's author string on ";" into individual authors
  str_split(";") %>%
  map_chr(function(author_list) {
    # A missing author field stays NA rather than becoming the string "NA"
    if (all(is.na(author_list))) {
      return(NA_character_)
    }
    # Surname = leading run of characters before the first comma or
    # whitespace. Entries with no such run (e.g. the empty piece left by a
    # trailing ";") yield NA here and are dropped below.
    # NOTE(review): multi-word surnames ("van der Berg") are cut to their
    # first word by this rule - confirm they are caught in the manual check.
    surnames <- str_extract(trimws(author_list), "^[^,\\s]+")
    paste(surnames[!is.na(surnames)], collapse = "; ")
  })
# Preview the first rows of the original and cleaned author columns
head(select(fullset, authors, authors_clean))
# Save the dataset as file 6. write.csv() keeps its default of writing row
# names, so the file starts with an unnamed index column.
write.csv(x = fullset, file = "datasets/6_fullset_authors.csv")
##################################################################
# Manual step: check the authors list in file 6 for errors, correct it by hand,
# and save the corrected version as file 7
##################################################################
# Read in file 7
fullset <- read.csv("datasets/7_fullset_authors.csv")
# Count how often each surname occurs across all records and keep the
# surnames that occur more than once.
top_authors <- fullset %>%
  # as.character() ensures the column is a character vector before splitting.
  # Splitting on ";" with optional surrounding whitespace (instead of the
  # exact string "; ") tolerates uneven spacing in the hand-edited file.
  mutate(
    authors_clean = strsplit(
      as.character(authors_clean),
      "[[:space:]]*;[[:space:]]*"
    )
  ) %>%
  # Expand the list column so that each author gets its own row
  unnest(authors_clean) %>%
  mutate(authors_clean = trimws(authors_clean)) %>%
  # Missing or empty entries are not authors and must not be counted
  filter(!is.na(authors_clean), authors_clean != "") %>%
  count(authors_clean) %>%
  filter(n > 1)
###############################################################