pdf切割成多个

Categories: Tools

1. 按 590 页分割：在 `split_pdf` 函数中，使用 `while` 循环按 590 页一份对文件进行分割，直到处理完所有页面。

2. 检查文件大小：对于每个分割出的子文件，将其保存为临时文件 `temp.pdf`，并检查其大小是否超过 180MB。

3. 进一步平分页面：若子文件大小超过 180MB，则通过递归的方式进一步平分页面，确保每个子文件大小不超过 180MB。

4. 多线程处理：`split_all_pdfs_in_directory` 函数使用 8 个线程并行处理指定目录中的所有 PDF 文件，提高处理效率。通过以上逻辑，代码可以确保每个分割后的子 PDF 文件既不超过 590 页，也不超过 180MB。

xxx

pip install PyMuPDF

当文件页数超过 550 页，或文件大小超过 150MB 时，对其进行分割。分割完成后，被分割的 PDF 源文件会被移动（剪切）到 splitted_pdfs 文件夹。

import os
import fitz


def split_pdf(file_path, chunk_size=550, max_size_mb=150):
    """Split a PDF so that no part exceeds a page limit or a size limit.

    The file is split when it is larger than ``max_size_mb`` megabytes or has
    more than ``chunk_size`` pages. Parts are written next to the source file
    as ``<source name>_splited<n>.pdf`` (``n`` starts at 1). After a
    successful split the source file is moved into a ``splitted_pdfs``
    sub-folder of its own directory.

    Splitting strategy:
      1. If the document has more than ``chunk_size`` pages, it is cut into
         consecutive ranges of ``chunk_size`` pages. Otherwise (the file is
         only too large) it is cut into two halves.
      2. Every range is saved; if the saved part is still larger than
         ``max_size_mb`` and holds more than one page, it is removed and the
         range is halved recursively. A single page that is larger than the
         limit is written as its own part, because it cannot be split further.

    Args:
        file_path: Path of the source PDF.
        chunk_size: Maximum number of pages per part. Must be >= 1.
        max_size_mb: Maximum size of a part in megabytes (1 MB = 1024 * 1024
            bytes).

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A non-positive chunk size can never make progress through the pages.
        raise ValueError("chunk_size must be a positive integer")

    try:
        file_size = os.path.getsize(file_path) / (1024 * 1024)
        doc = fitz.open(file_path)
    except Exception:
        # The concrete exception types raised by PyMuPDF differ between
        # versions, so catch Exception here; unlike a bare ``except`` this
        # still lets KeyboardInterrupt / SystemExit propagate.
        print(f"无法打开文件 {file_path}，请检查文件是否损坏或格式是否正确。")
        return

    try:
        total_pages = doc.page_count
        print(f"文件 {file_path} 的页面数为 {total_pages}，大小为 {file_size:.2f}MB")

        if file_size <= max_size_mb and total_pages <= chunk_size:
            print(f"文件 {file_path} 的大小不超过 {max_size_mb}MB 且页面数不超过 {chunk_size}，无需分割")
            return

        print(f"开始分割文件 {file_path}，总页数: {total_pages}，文件大小: {file_size:.2f}MB")

        base_name = os.path.splitext(os.path.basename(file_path))[0]
        source_dir = os.path.dirname(file_path)
        output_dir = os.path.join(source_dir, "splitted_pdfs")
        os.makedirs(output_dir, exist_ok=True)

        if total_pages > chunk_size:
            pages_per_part = chunk_size
        else:
            # Only the file size is over the limit: start from two halves
            # (the first half takes the extra page when the count is odd).
            pages_per_part = max(1, (total_pages + 1) // 2)

        part_num = 1
        for start_page in range(0, total_pages, pages_per_part):
            end_page = min(start_page + pages_per_part, total_pages)
            part_num = _write_pdf_parts(
                doc, start_page, end_page, source_dir, base_name, part_num, max_size_mb
            )

        print(f"完成分割文件 {file_path}")
    finally:
        # The source must be closed before it can be moved (required on
        # Windows) and must not stay open if writing a part fails.
        doc.close()

    # Move the source file into the splitted_pdfs folder.
    new_file_path = os.path.join(output_dir, os.path.basename(file_path))
    try:
        os.rename(file_path, new_file_path)
        print(f"已将源文件 {file_path} 移动到 {new_file_path}")
    except OSError:
        # Covers PermissionError as well as e.g. an already existing target.
        print(f"无法移动文件 {file_path} 到 {new_file_path}，该文件可能正在被其他程序使用。")


def _write_pdf_parts(doc, start_page, end_page, out_dir, base_name, part_num, max_size_mb):
    """Write pages ``[start_page, end_page)`` of ``doc`` as one or more parts.

    The range is saved as a single part first. If that part is larger than
    ``max_size_mb`` and contains more than one page, it is deleted and both
    halves of the range are written recursively instead.

    Returns:
        The part number to use for the next part.
    """
    output_filename = os.path.join(out_dir, f"{base_name}_splited{part_num}.pdf")

    new_doc = fitz.open()
    try:
        # Copy the whole range with one call instead of page by page.
        new_doc.insert_pdf(doc, from_page=start_page, to_page=end_page - 1)
        # Save with garbage collection and stream compression enabled.
        new_doc.save(output_filename, garbage=4, deflate=True)
    finally:
        new_doc.close()

    new_size = os.path.getsize(output_filename) / (1024 * 1024)
    page_count = end_page - start_page

    if new_size > max_size_mb and page_count > 1:
        # Too large: discard this attempt and retry with two smaller ranges.
        os.remove(output_filename)
        middle = start_page + (page_count + 1) // 2
        part_num = _write_pdf_parts(
            doc, start_page, middle, out_dir, base_name, part_num, max_size_mb
        )
        return _write_pdf_parts(
            doc, middle, end_page, out_dir, base_name, part_num, max_size_mb
        )

    print(f"创建子文件: {output_filename}，包含页面 {start_page + 1}-{end_page}，大小: {new_size:.2f}MB")
    return part_num + 1


def split_all_pdfs_in_directory(directory, chunk_size=550):
    """Run ``split_pdf`` on every PDF file located directly in ``directory``.

    Sub-directories are not searched. An entry counts as a PDF when it is a
    regular file whose name ends with ``.pdf`` (case-insensitive).

    Args:
        directory: Path of the folder that contains the PDF files.
        chunk_size: Page limit per part, passed through to ``split_pdf``.
    """
    for entry in os.listdir(directory):
        candidate = os.path.join(directory, entry)

        # Skip anything that is not a regular file (e.g. sub-folders).
        if not os.path.isfile(candidate):
            continue
        # Skip files without a .pdf extension.
        if not entry.lower().endswith('.pdf'):
            continue

        split_pdf(candidate, chunk_size)


if __name__ == "__main__":
    # Folder to process and the page limit for each generated part.
    pages_per_chunk = 550
    pdf_directory = r"D:\BaiduSyncdisk\Tea\Tea-knowledge茶叶书本"

    # Split every PDF found in that folder.
    split_all_pdfs_in_directory(pdf_directory, chunk_size=pages_per_chunk)

需求：用 Python 代码将 PDF 分割成多个文件。文件夹里有多个 PDF 文件，每个文件多达 3600 页以上。编写代码，将每个 PDF 文件分割成多个 PDF 子文件，每个子文件最多 550 页；子文件名为源文件名加序号 1、2、3、4 等。

安装必要的库：pip install PyPDF2
将脚本中的 pdf_directory 变量替换为包含 PDF 文件的目录路径
运行脚本：python split_pdfs.py

import os
from PyPDF2 import PdfReader, PdfWriter

def split_pdf(file_path, chunk_size=550):
    """Split a PDF into consecutive parts of at most ``chunk_size`` pages.

    Parts are written to a ``splitted_pdfs`` folder next to the source file
    and are named ``<source name>_<n>.pdf`` with ``n`` starting at 1. A file
    whose page count does not exceed ``chunk_size`` is left untouched.

    Args:
        file_path: Path of the source PDF.
        chunk_size: Number of pages per part.
    """
    with open(file_path, 'rb') as source:
        reader = PdfReader(source)
        page_total = len(reader.pages)

        # Nothing to do for documents that already fit into one part.
        if page_total <= chunk_size:
            print(f"文件 {file_path} 的页面数不超过 {chunk_size}，无需分割")
            return

        print(f"开始分割文件 {file_path}，总页数: {page_total}")

        # Source name without its extension; used as prefix for the parts.
        stem, _ = os.path.splitext(os.path.basename(file_path))

        # Parts go into a sub-folder of the source file's directory.
        target_dir = os.path.join(os.path.dirname(file_path), "splitted_pdfs")
        os.makedirs(target_dir, exist_ok=True)

        for part_no, first in enumerate(range(0, page_total, chunk_size), start=1):
            last = min(first + chunk_size, page_total)

            # Collect the pages of this part in a fresh writer.
            writer = PdfWriter()
            for page_no in range(first, last):
                writer.add_page(reader.pages[page_no])

            part_path = os.path.join(target_dir, f"{stem}_{part_no}.pdf")
            with open(part_path, 'wb') as sink:
                writer.write(sink)

            print(f"创建子文件: {part_path}")

        print(f"完成分割文件 {file_path}")

def split_all_pdfs_in_directory(directory, chunk_size=550):
    """Split each PDF file that sits directly inside ``directory``.

    Only regular files whose name ends with ``.pdf`` (case-insensitive) are
    processed; sub-directories are not searched.

    Args:
        directory: Path of the folder that contains the PDF files.
        chunk_size: Number of pages per part, forwarded to ``split_pdf``.
    """
    # Lazily build the paths of all entries that carry a .pdf extension.
    pdf_paths = (
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.lower().endswith('.pdf')
    )

    for path in pdf_paths:
        # Ignore directories that merely happen to be named like a PDF.
        if os.path.isfile(path):
            split_pdf(path, chunk_size)

if __name__ == "__main__":
    # Page limit per part and the folder to process.
    pages_per_chunk = 550
    pdf_directory = r"C:\Users\Administrator\MinerU"  # replace with the path of your PDF folder

    # Split every PDF found in that folder.
    split_all_pdfs_in_directory(pdf_directory, chunk_size=pages_per_chunk)

Related posts

Leave a Reply Cancel reply