# Extracted from a repository file view: config.yaml (73 lines, 65 loc, 2.77 KB).
# production_config.yaml
# Task generation: per-subject volume bounds, batch size, and quality cutoff.
# The settings below are nested under this key; at column 0 they would be
# parsed as unrelated top-level keys and `task_generation` would be null.
task_generation:
  # Minimum tasks to accumulate before flushing to a new file.
  min_tasks_per_subject: 50000
  # Maximum tasks per Parquet file; flush when this count is reached.
  max_tasks_per_subject: 100000
  # Number of tasks generated per batch.
  batch_size: 200
  # Discard tasks scoring below this threshold (range 0 to 1).
  quality_threshold: 0.5
# Bias detection: how much domain overrepresentation is tolerated and how
# much rebalancing effort is spent before giving up.
bias_detection:
  # Allowed overrepresentation ratio per domain
  # (0.1 = 10% over the expected share).
  threshold: 0.1
  # Maximum number of rebalancing attempts before early exit.
  max_rebalancing_attempts: 5
  # Minimum tasks needed before bias checks are enforced.
  min_tasks_before_rebalancing: 5000
  # No interactive confirmations in production.
  user_confirmation: false
# Summarization: optional context reduction for long text chunks.
# `enabled` must stay nested here; the same key name is also used by
# `parallel_processing`, so at column 0 the two would collide.
summarization:
  # Enable summarization for long text chunks.
  enabled: true
  # Summarization model used for context reduction.
  model_name: "facebook/bart-large-cnn"
  # Maximum summary length.
  # NOTE(review): the unit (tokens vs. characters) is not stated here;
  # confirm against the code that consumes this value.
  max_length: 130
  # Minimum summary length (same unit as max_length).
  min_length: 30
# Device selection for model execution.
device_config:
  # Device to use: "auto" (auto-select), "cpu", or "cuda".
  device: "auto"
  # Use FP16 on supported GPUs.
  enable_fp16: true
# Logging: level, destination, and rotation policy.
logging:
  # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL).
  level: "INFO"
  # File path for log output.
  log_file: "/var/log/task_generation.log"
  # Console logging is disabled for production.
  enable_console: false
  # Enable log rotation.
  rotate: true
  # Maximum log file size before rotation.
  # NOTE(review): this is a string with a unit suffix, not a byte count;
  # confirm the consumer parses "100MB" rather than expecting an integer.
  max_file_size: "100MB"
  # Number of rotated log files to retain.
  backup_count: 5
# Parallel processing: concurrent handling of input files.
# `enabled` must stay nested here; the same key name is also used by
# `summarization`, so at column 0 the two would collide.
parallel_processing:
  # Enable concurrent processing of files.
  enabled: true
  # Number of worker threads for parallel processing.
  num_workers: 8
# Resource limits: the process exits when either ceiling is exceeded.
resource_limits:
  # Exit if memory usage exceeds this many megabytes (8192 MB = 8 GB).
  max_memory_mb: 8192
  # Exit if CPU usage exceeds this percentage.
  max_cpu_percent: 85
# Storage: input and output locations.
storage:
  # Folder path where input documents reside.
  input_folder: "/data/documents"
  # Folder path for saving generated Parquet files.
  output_folder: "/data/parquet_output"
# Subject extraction: maps each subject name to the keywords associated
# with it. Structure: subject_extraction -> keywords -> <Subject> -> list.
# NOTE(review): all keywords are lowercase; presumably matching is
# case-insensitive or the text is lowercased first - confirm in the consumer.
subject_extraction:
  keywords:
    Regulatory:
      - "regulation"
      - "compliance"
      - "policy"
      - "faa"
    Security:
      - "attack"
      - "risk"
      - "vulnerability"
      - "threat"
    Finance:
      - "finance"
      - "investment"
      - "bank"
    IT:
      - "software"
      - "technology"
      # NOTE(review): "it" is a very short keyword; if the consumer matches
      # by substring rather than whole word, it will hit words such as
      # "with" or "item" - confirm the matching strategy.
      - "it"
    Operations:
      - "operation"
      - "supply chain"
      - "logistics"
    Sales:
      - "sales"
      - "revenue"
      - "customer"
    # NOTE(review): presumably the catch-all subject - confirm in the consumer.
    General:
      - "general"