import os
import subprocess
import whisper
import torch
from zhconv import convert
from tqdm import tqdm
def extract_audio(video_file):
    """Extract the audio track of a video into a WAV file using ffmpeg.

    The output is written next to the input as ``<video base name>.wav``
    and is requested from ffmpeg as ``pcm_s16le`` audio at 16000 Hz with a
    single channel, with the video stream dropped (``-vn``).

    Args:
        video_file: Path of the input video file.

    Returns:
        The path of the WAV file. The path is returned even when ffmpeg
        fails (the error is only printed), so callers should check that the
        file actually exists before using it.
    """
    base_name = os.path.splitext(video_file)[0]
    audio_file = f"{base_name}.wav"
    # Pass an argument list instead of a shell command string so that file
    # names containing quotes, spaces or shell metacharacters are handed to
    # ffmpeg verbatim and can never be interpreted as shell syntax.
    command = [
        "ffmpeg",
        "-i", video_file,
        "-vn",
        "-acodec", "pcm_s16le",
        "-ar", "16000",
        "-ac", "1",
        audio_file,
    ]
    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: ffmpeg exited with a non-zero status.
        # OSError: ffmpeg could not be started at all (e.g. not installed).
        # Without a shell the latter is raised directly, so it is caught here
        # to keep the original best-effort "print and carry on" behaviour.
        print(f"提取音频时出错: {e}")
    return audio_file
def transcribe_audio(audio_file, model):
    """Transcribe one audio file and save the text next to it.

    Args:
        audio_file: Path of the audio file to recognise.
        model: Object providing ``transcribe(path, language=..., fp16=...)``
            that returns a mapping with a ``"text"`` entry (in this script,
            the model returned by ``whisper.load_model``).

    Returns:
        The path of the UTF-8 text file that was written; it shares the
        audio file's base name and has a ``.txt`` extension.
    """
    stem, _ = os.path.splitext(audio_file)
    output_txt_file = stem + ".txt"

    # Ask for Chinese explicitly and run with fp16 disabled.
    recognised = model.transcribe(audio_file, language='zh', fp16=False)["text"]

    # Convert Traditional Chinese characters to Simplified Chinese.
    simplified = convert(recognised, 'zh-cn')

    with open(output_txt_file, 'w', encoding='utf-8') as out:
        out.write(simplified)

    print(f"已将 {audio_file} 的识别结果保存到 {output_txt_file}")
    return output_txt_file
def batch_transcribe(model_name='medium', use_gpu=False):
    """Transcribe every supported media file in the current working directory.

    Video files (.mp4, .mov, .avi) first have their audio extracted to a WAV
    file via ``extract_audio``; that WAV is transcribed and then removed.
    Audio files (.wav, .mp3, .m4a) are transcribed directly. Extensions are
    matched case-insensitively.

    Args:
        model_name: Name passed to ``whisper.load_model``.
        use_gpu: When true and CUDA is available, load the model on
            ``'cuda'``; otherwise the model is loaded on ``'cpu'``.
    """
    video_exts = ('.mp4', '.mov', '.avi')
    audio_exts = ('.wav', '.mp3', '.m4a')

    device = 'cuda' if use_gpu and torch.cuda.is_available() else 'cpu'
    model = whisper.load_model(model_name, device=device)

    current_dir = os.getcwd()
    entries = os.listdir(current_dir)
    # Snapshot of what was in the directory before any extraction, so that a
    # WAV the user already had is never mistaken for a temporary file.
    preexisting = set(entries)
    media_files = [
        name for name in entries
        if os.path.splitext(name)[1].lower() in video_exts + audio_exts
    ]

    for file in tqdm(media_files, desc="处理进度", unit="文件"):
        file_ext = os.path.splitext(file)[1].lower()
        if file_ext in audio_exts:
            transcribe_audio(file, model)
            continue

        # Remaining entries are videos: extract, transcribe, clean up.
        audio_file = extract_audio(file)
        is_temporary = audio_file not in preexisting
        try:
            if os.path.exists(audio_file):
                transcribe_audio(audio_file, model)
            else:
                # extract_audio only prints on failure; skip this video
                # instead of letting a missing file abort the whole batch.
                print(f"跳过 {file}: 未能提取音频")
        finally:
            # Remove the extracted WAV even if transcription raised, but
            # only when this run created it.
            if is_temporary and os.path.exists(audio_file):
                os.remove(audio_file)
if __name__ == "__main__":
    # Script entry point: transcribe the current directory with the
    # 'medium' model, without requesting the GPU.
    model_to_use, use_gpu = 'medium', False
    batch_transcribe(model_name=model_to_use, use_gpu=use_gpu)
Leave a Reply
You must be logged in to post a comment.