-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathactivity_data.py
More file actions
116 lines (100 loc) · 4.51 KB
/
activity_data.py
File metadata and controls
116 lines (100 loc) · 4.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# Import some constants
import json
from datetime import datetime, timedelta
import csv
import pandas as pd
# Get data files
watch_round1_session1 = pd.read_csv('watch_round1_session1.csv')
watch_round1_session2 = pd.read_csv('watch_round1_session2.csv')
watch_round1_session3 = pd.read_csv('watch_round1_session3.csv')
watch_round2_session1 = pd.read_csv('watch_round2_session1.csv')
watch_round2_session2 = pd.read_csv('watch_round2_session2.csv')
watch_round2_session3 = pd.read_csv('watch_round2_session3.csv')
round1_files = [watch_round1_session1, watch_round1_session2, watch_round1_session3]
round2_files = [watch_round2_session1, watch_round2_session2, watch_round2_session3]
# Load session activity log
with open("sessions_log.json", "r") as f:
session_log = json.loads(f.read())
### Constants and Functions ###
# Two rounds of data collection
rounds = ['round1', 'round2']
# For each data collection round there was 1 user participating
users_per_round = 1
# For each data collection round there were 3 separate sessions
sessons_per_round = 3
# Timestamp format to follow
time_format = '%Y-%m-%d %H:%M:%S.%f'
# Columns to use from the csv data
session_sensor_data_columns = [
"loggingTime(txt)",
"motionRotationRateX(rad/s)",
"motionRotationRateY(rad/s)",
"motionRotationRateZ(rad/s)",
"motionUserAccelerationX(G)",
"motionUserAccelerationY(G)",
"motionUserAccelerationZ(G)"
]
# Helper functions
def convert_timestamp(x):
return datetime.fromtimestamp(x).strftime(time_format)
def chunks(l, n):
"""Yield successive n-sized chunks from l."""
for i in range(0, len(l), n):
yield l[i:i + n]
def make_delta(time):
h, m, seconds = time.split(':')
s, milli = seconds.split('.')
milli = milli + str(0)
return timedelta(hours=int(h), minutes=int(m), seconds=int(s), milliseconds=int(milli))
################################
# Load Activity Data
# Iterate through all user session logs
# Map to the proper activity label
# Concatenate into a single activity dataframe
session_id = 0 # Keep track of session id: unique to each user file
activity_data = pd.DataFrame()
# For each round of data collection
for rnd in rounds:
print(f'Parsing {rnd} activity data', flush=True)
# Grab the files for this round
if(rnd == 'round1'):
rnd_files = round1_files
else:
rnd_files = round2_files
# Group files for the round by session
rnd_session_files = list(chunks(rnd_files, users_per_round))
# Should have the right number of sessions, each file within a session is a unique user
assert len(rnd_session_files) == sessons_per_round
# For each session (3) within the round
for session in range(sessons_per_round):
print(f'Parsing session {session + 1} data files', flush=True)
# Grab the user activity log files for this session
session_files = rnd_session_files[session]
# Grab the activity labels for this session
session_log_data = session_log[rnd]['session' + str(1+session)]
# For each user
for user_file in session_files:
# Load user file for this session - fix timestamp and add session id
user_log_df = user_file[session_sensor_data_columns]
user_log_df["loggingTime"] = user_log_df["loggingTime(txt)"].apply(lambda x: pd.to_datetime(x).replace(tzinfo=None))
user_log_df.drop("loggingTime(txt)", axis=1, inplace=True)
user_log_df["sessionId"] = session_id
session_id += 1
# Convert timestamp and make sure it's ordered appropriately
user_log_df.sort_values(by="loggingTime", ascending=True)
first_val = user_log_df["loggingTime"][0]
# Get the logs that contain the activity labels for this session
user_session_activity_df = pd.DataFrame({
'activity': pd.Series([s[0] for s in session_log_data], dtype=str),
'loggingTime': pd.Series([first_val + make_delta(s[1]) for s in session_log_data])
}).sort_values(by="loggingTime", ascending=True)
# Fuzzy merge on timestamps to map user logs to activity labels
user_log_cleaned = pd.merge_asof(
left=user_log_df,
right=user_session_activity_df,
on='loggingTime',
direction='forward'
)
activity_data = pd.concat((activity_data, user_log_cleaned))
# In the end, you will have the fully cleaned and joined activity data file
activity_data.to_csv(r'/content/watch_activity_data.csv')