-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDATA401-SQL-R-datamanipulation
More file actions
130 lines (93 loc) · 3.87 KB
/
DATA401-SQL-R-datamanipulation
File metadata and controls
130 lines (93 loc) · 3.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
## Adefoluke Shemsu
## Week 7 Assignment
# We will use occupational employment statistics data. The file is somewhat large (N > 430,000).
# For details, see http://www.bls.gov/oes/current/oes_stru.htm.
setwd("~/Documents/Education/Penn/Classes/DATA 401/Week 7") # Setting directory.
library(sqldf)
library(tidyverse)
# Peek at the first few rows of the dataset:
read.table("oesm.csv", sep = ";", header = TRUE, fill = TRUE, nrows = 10)
# 1. Begin SQL processing by creating a new database called 'conemp'
# Then add a table called 'oesm' (A best practice here is to use an
# if statement to delete a table called 'oesm' if one already exists.)
{
conemp <- dbConnect(SQLite(), dbname = "oesm.db")
if (dbExistsTable(conemp, "oesm"))
dbRemoveTable(conemp, "oesm")
dbWriteTable(
conemp,
"oesm",
"oesm.csv",
sep = ";",
header = TRUE,
row.names = FALSE
)
dbDisconnect(conemp)
} # Creating db.
# 2. Display the first 10 rows of all of the data in your table.
conemp.db <- dbConnect(SQLite(), dbname = "oesm.db") # Establishing db connection in order to run query.
conemp.q <- dbSendQuery(conemp.db, "
SELECT *
FROM oesm") # Running db query ('.q' = query for future reference).
fetch(conemp.q, n = 10)
dbClearResult(conemp.q)
# 3. Select a subset of columns, 'occtitle', 'h_mean', 'area_title' for
# all of the observations where the grouping variable is 'major.'
major.q <- dbSendQuery(conemp.db, "
SELECT occtitle, h_mean, area_title
FROM oesm
WHERE (grouping == 'major')") # Selecting subset.
fetch(major.q, n = 15) # Testing query.
dbClearResult(major.q)
# 4. Display all of the unique values of the variable 'naics'
uniqval.q <- dbSendQuery(conemp.db, "
SELECT DISTINCT naics
FROM oesm") # Displaying values.
fetch(uniqval.q, n = 15) # Testing query.
dbClearResult(uniqval.q)
# 5. Count all of the observations for each year in the dataset.
cntobs.q <- dbSendQuery(conemp.db, "
SELECT year, COUNT(*)
FROM oesm
GROUP BY year") # Counting observations.
fetch(cntobs.q, n = 15) # Testing query.
dbClearResult(cntobs.q)
# 6. Find minimum and maximum of median annual salary (a_mean) for
# each year
minmax.q <- dbSendQuery(conemp.db, "
SELECT MIN(a_mean)
FROM oesm
GROUP BY year") # Finding min.
fetch(minmax.q, n = -1) # Testing query.
dbClearResult(minmax.q)
#######################
minmax2.q <- dbSendQuery(conemp.db, "
SELECT MAX([a_mean])
FROM oesm
WHERE ([a_mean] IS NOT 'NA')
GROUP BY year") # Finding max.
fetch(minmax2.q, n = -1) # Testing query.
dbClearResult(minmax2.q)
# 7. Create a new table from the same database that includes
# the unique values of occcode and occtitle
if(dbExistsTable(conemp.db, "uniqtab")) dbRemoveTable(conemp.db, "uniqtab")
uniqtab.q <- dbSendQuery(conemp.db, "
CREATE TABLE uniqtab AS
SELECT DISTINCT occcode,occtitle
FROM oesm
")
dbClearResult(uniqtab.q)
# 8. Using your new SQL skills, pull some information from either table
# and create a visualization (of any kind) with your extracted data. In
# one or two sentences, briefly comment on your visual.
viz <- dbSendQuery(conemp.db, "
SELECT occtitle, a_mean
FROM oesm
WHERE (grouping == 'major')")
viz <- fetch(viz, n = 33756) # Turning 'major' subset into df to plot.
ggplot(viz,
aes(x = a_mean, y = occtitle)) + # Creating plot.
geom_point()+
xlab("Median Annual Salary")+
ylab("OCC Title")
# This visualization aggregates then compares the range of median annual salaries for each occuputation title.