Extracting video transcripts with Whisper

The script below walks the current directory, pulls the audio out of any video files with ffmpeg, transcribes the speech with OpenAI's Whisper, converts the result from Traditional to Simplified Chinese, and writes a .txt transcript next to each media file.

import os
import subprocess
import whisper
import torch
from zhconv import convert
from tqdm import tqdm
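# "whisper" here is the openai-whisper package from PyPI; zhconv supplies the
# Traditional-to-Simplified Chinese converter used below.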


def extract_audio(video_file):
    """Extract a 16 kHz mono WAV track from a video file with ffmpeg."""
    base_name = os.path.splitext(video_file)[0]
    audio_file = f"{base_name}.wav"
    # -vn drops the video stream; pcm_s16le / 16 kHz / mono matches Whisper's expected input.
    # -y overwrites an existing WAV instead of blocking on ffmpeg's interactive prompt.
    command = [
        'ffmpeg', '-y', '-i', video_file,
        '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1',
        audio_file,
    ]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error while extracting audio: {e}")
        return None
    return audio_file
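# For a hypothetical clip.mp4, the call above is equivalent to running this in a shell:
#   ffmpeg -y -i clip.mp4 -vn -acodec pcm_s16le -ar 16000 -ac 1 clip.wav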


def transcribe_audio(audio_file, model):
    """Transcribe an audio file with Whisper and save a Simplified-Chinese .txt transcript."""
    base_name = os.path.splitext(audio_file)[0]
    output_txt_file = f"{base_name}.txt"
    # Force Chinese decoding; fp16=False avoids the half-precision warning on CPU.
    result = model.transcribe(audio_file, language='zh', fp16=False)
    text = result["text"]
    # Convert Traditional Chinese output to Simplified Chinese.
    simplified_text = convert(text, 'zh-cn')
    with open(output_txt_file, 'w', encoding='utf-8') as f:
        f.write(simplified_text)
    print(f"Saved the transcript of {audio_file} to {output_txt_file}")
    return output_txt_file
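# Note: besides "text", the dict returned by model.transcribe() also contains
# "segments", a list of dicts with "start"/"end" timestamps and per-segment text,
# which is what you would read if you wanted timed subtitles instead of plain text.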


# Supported extensions (videos get their audio extracted first).
VIDEO_EXTS = {'.mp4', '.mov', '.avi'}
AUDIO_EXTS = {'.wav', '.mp3', '.m4a'}


def batch_transcribe(model_name='medium', use_gpu=False):
    """Transcribe every supported media file in the current working directory."""
    device = 'cuda' if use_gpu and torch.cuda.is_available() else 'cpu'
    model = whisper.load_model(model_name, device=device)
    current_dir = os.getcwd()
    media_files = [
        f for f in os.listdir(current_dir)
        if os.path.splitext(f)[1].lower() in VIDEO_EXTS | AUDIO_EXTS
    ]

    for file in tqdm(media_files, desc="Progress", unit="file"):
        file_ext = os.path.splitext(file)[1].lower()
        if file_ext in VIDEO_EXTS:
            # Extract a temporary WAV, transcribe it, then delete it.
            audio_file = extract_audio(file)
            if audio_file is None:
                continue
            transcribe_audio(audio_file, model)
            if os.path.exists(audio_file):
                os.remove(audio_file)
        elif file_ext in AUDIO_EXTS:
            transcribe_audio(file, model)


if __name__ == "__main__":
    model_to_use = 'medium'
    use_gpu = False
    batch_transcribe(model_name=model_to_use, use_gpu=use_gpu)
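
To run it, ffmpeg must be on the PATH and the openai-whisper, zhconv, torch and tqdm packages installed. If you only need a single file rather than the whole folder, the two helpers can also be called directly; a minimal sketch, assuming a file named demo.mp4 (a hypothetical name) in the working directory:

model = whisper.load_model('small')      # a smaller model is quicker for a test run
audio = extract_audio('demo.mp4')
if audio is not None:
    transcribe_audio(audio, model)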
    
