-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_sample_data.py
More file actions
83 lines (65 loc) · 2.68 KB
/
generate_sample_data.py
File metadata and controls
83 lines (65 loc) · 2.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# __define-ocg__ Sample Data Generator
"""
Standalone script to generate sample user activity data for testing the ML pipeline.
"""
import json
import random
from datetime import datetime, timedelta
from pathlib import Path
# Global variables
# Number of days of sample data to produce; main() passes this value to
# generate_sample_data() as its num_days argument and echoes it in the
# start-up message.
varOcg = 30 # Number of days to generate
# Identifier string for this generator. It is not referenced anywhere else in
# the visible part of this file -- NOTE(review): confirm whether any external
# code reads it before removing or renaming.
varFiltersCg = "data_generator" # Generator identifier
def generate_sample_data(num_days: int = 30, users_per_day: int = 100):
    """Generate sample user activity logs.

    One batch of records is produced for each of ``num_days`` consecutive
    days, starting ``num_days`` days before the current local time.

    Args:
        num_days: Number of days to generate. Values <= 0 yield an empty list.
        users_per_day: Target number of records per day. The actual count for
            each day is drawn uniformly from
            ``[users_per_day // 2, users_per_day * 3 // 2]``, which is
            50-150 for the default of 100.

    Returns:
        A list of dicts, each with the keys ``user_id`` (``"user_NNNN"``),
        ``timestamp`` (ISO-8601 string), ``action`` (one of ``login``,
        ``view``, ``click``, ``purchase``) and ``session_duration``
        (int in 1-300).

    Raises:
        ValueError: If ``users_per_day`` is negative.
    """
    if users_per_day < 0:
        raise ValueError("users_per_day must be non-negative")

    # Spread the daily volume +/-50% around the requested target so that the
    # parameter actually controls the amount of data generated.
    min_users = users_per_day // 2
    max_users = users_per_day * 3 // 2
    actions = ["login", "view", "click", "purchase"]

    data = []
    base_date = datetime.now() - timedelta(days=num_days)
    for day in range(num_days):
        current_date = base_date + timedelta(days=day)
        daily_users = random.randint(min_users, max_users)
        for _ in range(daily_users):
            # Random time of day; the date part comes from current_date.
            # Keyword arguments are evaluated in order (hour, minute, second),
            # which keeps the sequence of random draws stable under a seed.
            timestamp = current_date.replace(
                hour=random.randint(0, 23),
                minute=random.randint(0, 59),
                second=random.randint(0, 59),
            )
            data.append({
                "user_id": f"user_{random.randint(1000, 9999)}",
                "timestamp": timestamp.isoformat(),
                "action": random.choice(actions),
                "session_duration": random.randint(1, 300),
            })
    return data
def main():
    """Generate sample data, save it as JSON, and print a short summary.

    The records are written to ``data/user_logs.json`` relative to the current
    working directory; the ``data`` directory is created if it does not exist.
    The summary printed to stdout covers the record count, the number of
    unique user ids, the timestamp range and the count of each action.
    """
    # Function-local import so this block does not depend on the module's
    # top-level import list.
    from collections import Counter

    print(f"Generating sample data for {varOcg} days...")

    # Create data directory if it doesn't exist
    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)

    # Generate data
    sample_data = generate_sample_data(varOcg)
    total_records = len(sample_data)

    # Save to JSON file (explicit encoding avoids platform-dependent defaults)
    output_file = data_dir / "user_logs.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(sample_data, f, indent=2)
    print(f"Generated {total_records} records")
    print(f"Data saved to {output_file}")

    # Create a summary
    print("\nData Summary:")
    print(f"Total records: {total_records}")
    unique_users = {record["user_id"] for record in sample_data}
    print(f"Unique users: {len(unique_users)}")
    if sample_data:
        # Timestamps are compared as strings, exactly as before.
        timestamps = [record["timestamp"] for record in sample_data]
        print(f"Date range: {min(timestamps)} to {max(timestamps)}")
    else:
        # min()/max() raise ValueError on an empty sequence, so report the
        # empty case explicitly instead of crashing.
        print("Date range: n/a (no records generated)")

    # Action distribution; Counter keeps first-seen order, so the output
    # order matches a plain dict filled in a loop.
    action_counts = Counter(record["action"] for record in sample_data)
    print("\nAction distribution:")
    for action, count in action_counts.items():
        print(f" {action}: {count}")
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()