-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathbuildRdfFile.py
More file actions
74 lines (65 loc) · 2.24 KB
/
buildRdfFile.py
File metadata and controls
74 lines (65 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import csv
import time
import datetime
import argparse
from rdflib import Graph
from rdflib.namespace import DC, SKOS
from rdflib import URIRef, Literal
import os
# Command-line interface. Both options are optional so the script can also be
# run interactively with no arguments.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-f', '--fileName',
    help='the CSV file of source data. '
         'optional - if not provided, the script will ask for input',
)
parser.add_argument(
    '-d', '--directory',
    help='the directory for the input and '
         'output files. optional - if not provided, the script will assume null',
)
args = parser.parse_args()

# Fall back to an interactive prompt when no file name was passed.
if args.fileName:
    fileName = args.fileName
else:
    fileName = input(
        'Enter the file name of the CSV of source data '
        "(including '.csv'): "
    )

# An empty directory means "stay in the current working directory".
# os.chdir('') raises FileNotFoundError, so only change directory when a
# non-empty value was actually supplied.
directory = args.directory if args.directory else ''
if directory:
    os.chdir(directory)
# Wall-clock start, used for the run-time report at the end of the script.
startTime = time.time()
# Today's date (YYYY-MM-DD); stamped on every new subject and used in the
# name of the triples CSV written later.
date = datetime.datetime.today().strftime('%Y-%m-%d')

# Maps each authorized (preferred) name to the URI minted for it, so that
# later rows carrying the same authorized name attach to the same subject.
nameUriDict = {}

# build graph
g = Graph()
g.bind('skos', SKOS)
g.bind('dc', DC)

# set uri starting point (incremented before use, so the first URI is 1001)
uriNum = 1000

# parse csv data and add triples to graph
# newline='' is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly.
with open(fileName, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        altLabel = row['name']
        prefLabel = row['authorizedName']
        try:
            subjectUri = nameUriDict[prefLabel]
        except KeyError:
            # First occurrence of this authorized name: mint a new URI and
            # record the preferred label and creation date once. (A missing
            # dict key raises KeyError, not ValueError.)
            # NOTE(review): '[Base URI]' looks like a placeholder meant to be
            # replaced with the real namespace before use - confirm.
            uriNum += 1
            subjectUri = '[Base URI]' + str(uriNum)
            nameUriDict[prefLabel] = subjectUri
            g.add((URIRef(subjectUri), SKOS.prefLabel, Literal(prefLabel)))
            g.add((URIRef(subjectUri), DC.date, Literal(date)))
        # A variant name is recorded as an altLabel for both new and existing
        # subjects, but never when it is empty or equal to the prefLabel.
        if altLabel != prefLabel and altLabel != '':
            g.add((URIRef(subjectUri), SKOS.altLabel, Literal(altLabel)))
# create rdf file
# The output sits next to the input with the extension swapped for '.n3'.
# os.path.splitext only strips the final extension, so names such as
# './data.csv' or 'names.v2.csv' (or a name with no dot at all) are handled.
# Passing the path lets rdflib open and close the file itself, rather than
# leaking a text-mode handle that rdflib would write bytes into.
n3FileName = os.path.splitext(fileName)[0] + '.n3'
g.serialize(destination=n3FileName, format='n3')
print(g.serialize(format='n3'))

# extract all triples to csv
# The context manager guarantees the file is flushed and closed; newline=''
# stops the csv module from emitting blank rows on Windows.
with open('allTriples' + str(date) + '.csv', 'w', newline='') as triplesFile:
    f = csv.writer(triplesFile)
    f.writerow(['subject'] + ['predicate'] + ['object'])
    for subj, pred, obj in g:
        f.writerow([subj] + [pred] + [obj])

# Report the elapsed wall-clock time as H:MM:SS.
elapsedTime = time.time() - startTime
m, s = divmod(elapsedTime, 60)
h, m = divmod(m, 60)
print('Total script run time: ', '%d:%02d:%02d' % (h, m, s))