-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstoreVectorBase.py
More file actions
38 lines (33 loc) · 1.34 KB
/
storeVectorBase.py
File metadata and controls
38 lines (33 loc) · 1.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
'''
Author: RedKold redkold233@gmail.com
Date: 2024-08-26 16:10:22
LastEditors: RedKold redkold233@gmail.com
LastEditTime: 2024-08-26 16:10:28
FilePath: /SRT-learner/storeVectorBase.py
Description: Generate sentence embeddings for every line of an SRT subtitle file and pickle them, together with the line texts and timestamps, into a local database file.
'''
import pickle
import os
from sentence_transformers import SentenceTransformer
def generate_and_save_embeddings(
    srt_file: str,
    model_name: str = 'paraphrase-multilingual-MiniLM-L12-v2',
    db_file: str = 'subtitle_db.pkl',
) -> None:
    """Embed every subtitle of an SRT file and pickle the result to disk.

    The pickle written to ``db_file`` holds a single dict with three keys:
    ``'lines'`` (subtitle texts with newlines replaced by spaces),
    ``'timestamps'`` (``"<start> --> <end>"`` strings, one per subtitle) and
    ``'embeddings'`` (whatever ``model.encode`` returns for ``lines``).

    Args:
        srt_file: Path of the SRT file to read; it is opened as UTF-8.
        model_name: Name handed to ``SentenceTransformer`` to select the model.
        db_file: Path of the pickle file to write. Missing parent
            directories are created.
    """
    # pysrt is only used by this function, so it is imported locally.
    import pysrt

    # Parse the subtitles before loading the model: a missing or unreadable
    # SRT file then fails right away instead of after the model has loaded.
    subs = pysrt.open(srt_file, encoding='utf-8')
    # Multi-line subtitles are flattened so each entry is a single line.
    lines = [sub.text.replace('\n', ' ') for sub in subs]
    timestamps = [f"{sub.start} --> {sub.end}" for sub in subs]

    # Initialise the model and encode all lines in one call.
    model = SentenceTransformer(model_name)
    # NOTE(review): convert_to_tensor=True presumably yields a torch tensor
    # living on the model's device; confirm that whoever unpickles the file
    # can load it there.
    embeddings = model.encode(lines, convert_to_tensor=True)

    # Create the destination directory when db_file points into one that does
    # not exist yet; open() would otherwise raise FileNotFoundError.
    parent_dir = os.path.dirname(db_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Persist texts, timestamps and embeddings together in one local file.
    with open(db_file, 'wb') as db:
        pickle.dump(
            {
                'lines': lines,
                'timestamps': timestamps,
                'embeddings': embeddings,
            },
            db,
        )
    print(f"数据已保存到 {db_file}")
if __name__ == "__main__":
    # Build and save the subtitle database.
    # Replace the path below with the path of your own SRT file.
    generate_and_save_embeddings(srt_file='srt/Frozen_chs.srt')