1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
|
import os
import re
import subprocess
import unicodedata

from funasr import AutoModel
# =========================
# Fixed configuration
# =========================
# Path of the media file that main() transcribes.
INPUT_FILE = "a.m4a"
# None means the output prefix is derived from the input file name,
# e.g. a.m4a -> a.srt / a.vtt / a.txt
OUTPUT_PREFIX = None
# Device passed to the FunASR model; may be changed to "cpu".
DEVICE = "cuda:0"
# Passed to merge_chars_to_subtitles as max_chars (characters per cue before a cut).
MAX_CHARS = 18
# Passed to merge_chars_to_subtitles as max_duration; compared against
# end - start of the timestamps, which the writers format as milliseconds.
MAX_DURATION = 5000
# Separate multiple hotwords with spaces, e.g. "亚洲 欧洲" (Asia Europe).
HOTWORD = ""
def ms_to_srt_time(ms):
    """Format a millisecond offset as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        ms: Offset in milliseconds; anything accepted by ``int()``
            (fractions are truncated).

    Returns:
        The timestamp string. Hours are zero-padded to at least two digits
        and grow beyond that for very long offsets.
    """
    remaining = int(ms)
    hours, remaining = divmod(remaining, 3_600_000)
    minutes, remaining = divmod(remaining, 60_000)
    seconds, millis = divmod(remaining, 1_000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"
def ms_to_vtt_time(ms):
    """Format a millisecond offset as a WebVTT timestamp ``HH:MM:SS.mmm``.

    Same layout as the SRT timestamp, except that the millisecond separator
    is a dot instead of a comma.
    """
    srt_stamp = ms_to_srt_time(ms)
    return srt_stamp.replace(",", ".")
def extract_audio(input_path, wav_path):
    """Transcode *input_path* into a mono, 16 kHz audio file at *wav_path*.

    Runs the ``ffmpeg`` executable found on PATH.

    Args:
        input_path: Source audio or video file.
        wav_path: Destination file; overwritten if it already exists (``-y``).

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    command = ["ffmpeg", "-y", "-i", input_path]
    command.append("-vn")  # drop any video stream
    command.extend(["-ac", "1"])  # single audio channel
    command.extend(["-ar", "16000"])  # 16 kHz sample rate
    command.append(wav_path)
    subprocess.run(command, check=True)
# One token is either a run of ASCII letters/digits (optionally with inner
# apostrophes, e.g. "don't") or any single non-whitespace character.
_TOKEN_RE = re.compile(r"[A-Za-z0-9]+(?:'[A-Za-z0-9]+)*|\S")


def _is_punctuation(token):
    """Return True if *token* is a single Unicode punctuation character."""
    return len(token) == 1 and unicodedata.category(token).startswith("P")


def split_text_by_timestamp(text, timestamps):
    """Pair the tokens of *text* with their ``[start, end]`` timestamps.

    The text is split into tokens: a run of ASCII letters/digits is one
    token, every other non-whitespace character is its own token. Whitespace
    is dropped.

    Punctuation does not consume a timestamp. A punctuation mark becomes a
    zero-length item placed at the end time of the preceding item (or at the
    first timestamp's start when it leads the text), so later tokens stay
    aligned with their timings.

    NOTE(review): this assumes the recognizer returns one timestamp per
    spoken token (one per CJK character, one per Latin word) and none for
    the punctuation inserted afterwards -- confirm against the output of the
    FunASR version in use.

    Args:
        text: Recognized text, possibly containing punctuation and spaces.
        timestamps: Sequence of ``[start, end]`` pairs.

    Returns:
        A list of ``{"text", "start", "end"}`` dicts in text order. Pairing
        stops at the first non-punctuation token that has no timestamp left.
    """
    if not timestamps:
        return []

    items = []
    available = len(timestamps)
    used = 0
    for token in _TOKEN_RE.findall(text):
        if _is_punctuation(token):
            # Anchor the mark to the preceding item so it stays attached to
            # it without shifting the timings of the tokens that follow.
            anchor = items[-1]["end"] if items else timestamps[0][0]
            items.append({"text": token, "start": anchor, "end": anchor})
            continue
        if used >= available:
            break
        stamp = timestamps[used]
        used += 1
        items.append({"text": token, "start": stamp[0], "end": stamp[1]})
    return items
def merge_chars_to_subtitles(items, max_chars=18, max_duration=5000):
    """Group timed text items into subtitle cues.

    Items are accumulated in order. A cue is closed as soon as any of these
    holds after adding an item:

    * the accumulated text has at least *max_chars* characters,
    * the span from the cue's first start to the current end is at least
      *max_duration*,
    * the item is one of the break marks ``。!?!?;;,,、``.

    A break mark that arrives right after a cue was closed is appended to
    that cue (extending its end time) instead of starting a cue that would
    contain only punctuation. A break mark at the very beginning, with no
    earlier cue to attach to, still forms its own cue.

    Args:
        items: Iterable of dicts with ``"text"``, ``"start"`` and ``"end"``.
        max_chars: Character count at which a cue is cut.
        max_duration: Duration (same unit as the item times) at which a cue
            is cut.

    Returns:
        A list of ``(start, end, text)`` tuples.
    """
    # Exact membership: a multi-character item never counts as a break mark.
    break_marks = frozenset("。!?!?;;,,、")
    subtitles = []
    buf = []
    char_count = 0
    start = None
    end = None

    for item in items:
        piece = item["text"]
        is_break = piece in break_marks

        if is_break and not buf and subtitles:
            # The previous cue has just been closed; glue the mark onto it
            # rather than emitting a punctuation-only cue.
            prev_start, _, prev_text = subtitles[-1]
            subtitles[-1] = (prev_start, item["end"], prev_text + piece)
            continue

        if start is None:
            start = item["start"]
        buf.append(piece)
        char_count += len(piece)
        end = item["end"]

        if is_break or char_count >= max_chars or end - start >= max_duration:
            subtitles.append((start, end, "".join(buf)))
            buf = []
            char_count = 0
            start = None
            end = None

    if buf:
        subtitles.append((start, end, "".join(buf)))
    return subtitles
def write_srt(subtitles, out_path):
    """Write *subtitles* to *out_path* as a UTF-8 SRT file.

    Each cue is written as its 1-based index, a ``start --> end`` timing
    line, the cue text and a blank line.

    Args:
        subtitles: Iterable of ``(start, end, text)`` tuples.
        out_path: Destination file; created or overwritten.
    """
    with open(out_path, "w", encoding="utf-8") as srt_file:
        srt_file.writelines(
            f"{number}\n"
            f"{ms_to_srt_time(begin)} --> {ms_to_srt_time(finish)}\n"
            f"{caption}\n\n"
            for number, (begin, finish, caption) in enumerate(subtitles, start=1)
        )
def write_vtt(subtitles, out_path):
    """Write *subtitles* to *out_path* as a UTF-8 WebVTT file.

    The file starts with the ``WEBVTT`` header and a blank line. Each cue
    follows as a ``start --> end`` timing line, the cue text and a blank
    line. Cues carry no index.

    Args:
        subtitles: Iterable of ``(start, end, text)`` tuples.
        out_path: Destination file; created or overwritten.
    """
    with open(out_path, "w", encoding="utf-8") as vtt_file:
        vtt_file.write("WEBVTT\n\n")
        for begin, finish, caption in subtitles:
            timing = f"{ms_to_vtt_time(begin)} --> {ms_to_vtt_time(finish)}"
            vtt_file.write(f"{timing}\n{caption}\n\n")
def write_txt(text, out_path):
    """Write *text* with all space characters removed to *out_path*.

    The output is UTF-8 and ends with a single newline.

    Args:
        text: Recognized text.
        out_path: Destination file; created or overwritten.
    """
    with open(out_path, "w", encoding="utf-8") as txt_file:
        without_spaces = "".join(text.split(" "))
        txt_file.write(without_spaces + "\n")
def main():
    """Transcribe ``INPUT_FILE`` and write txt / srt / vtt outputs.

    Steps:
      1. Check that the input exists and has a supported audio or video
         extension.
      2. Convert it with ffmpeg to ``<base>_16k.wav``.
      3. Run the FunASR model on that file.
      4. Always write ``<base>.txt``; write ``<base>.srt`` and
         ``<base>.vtt`` only when the result carries timestamps.

    ``<base>`` is ``OUTPUT_PREFIX`` when set, otherwise the input path
    without its extension.

    Raises:
        FileNotFoundError: If the input file does not exist.
        ValueError: If the file extension is not supported.
        RuntimeError: If the model returns no result.
    """
    input_path = INPUT_FILE
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"输入文件不存在:{input_path}")

    stem, raw_ext = os.path.splitext(input_path)
    base = OUTPUT_PREFIX or stem
    ext = raw_ext.lower()
    wav_path = base + "_16k.wav"

    video_exts = (".mp4", ".mkv", ".avi", ".mov", ".flv", ".wmv", ".webm")
    audio_exts = (".wav", ".mp3", ".flac", ".m4a", ".aac", ".ogg")

    # Both kinds go through the same ffmpeg conversion; only the notice differs.
    if ext in video_exts:
        print(f"检测到视频文件,正在提取音频:{wav_path}")
    elif ext in audio_exts:
        print(f"检测到音频文件,正在转换为 16 kHz 单声道 wav:{wav_path}")
    else:
        raise ValueError(f"不支持的文件格式:{ext}")
    extract_audio(input_path, wav_path)

    model_options = {
        "model": "paraformer-zh",
        "model_revision": "v2.0.4",
        "vad_model": "fsmn-vad",
        "vad_model_revision": "v2.0.4",
        "punc_model": "ct-punc-c",
        "punc_model_revision": "v2.0.4",
        "device": DEVICE,
        "disable_update": True,
    }
    model = AutoModel(**model_options)
    res = model.generate(input=wav_path, batch_size_s=300, hotword=HOTWORD)
    if not res:
        raise RuntimeError("FunASR 没有返回识别结果。")

    recognition = res[0]
    text = recognition.get("text", "")
    timestamps = recognition.get("timestamp", [])

    txt_path = base + ".txt"
    srt_path = base + ".srt"
    vtt_path = base + ".vtt"

    # The plain transcript is written whether or not timings are available.
    write_txt(text, txt_path)

    if not timestamps:
        print("识别完成,但当前结果没有 timestamp,只生成 txt:")
        print(txt_path)
        return

    items = split_text_by_timestamp(text, timestamps)
    subtitles = merge_chars_to_subtitles(
        items,
        max_chars=MAX_CHARS,
        max_duration=MAX_DURATION,
    )
    write_srt(subtitles, srt_path)
    write_vtt(subtitles, vtt_path)
    print("识别完成:")
    for produced in (srt_path, vtt_path, txt_path):
        print(produced)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|