# sharp_img/py-src/del_copyright_img.py

# Dependencies: pip install python-pptx Pillow imagehash
import os
import tempfile
import zipfile

import imagehash
from PIL import Image
from pptx import Presentation
from pptx.dml.color import RGBColor
from pptx.opc.constants import RELATIONSHIP_TYPE as RT
from pptx.oxml import parse_xml
from pptx.util import Inches

from config import Config

def get_copyright_image_paths():
    """Return the paths of the reference copyright images.

    The images are read from ``../assets/copyright-img`` relative to the
    directory that contains this file. Only entries whose name ends with one
    of the supported image extensions (compared case-insensitively) are
    returned, in whatever order ``os.listdir`` yields them.
    """
    # Image formats that are accepted as reference images.
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

    here = os.path.dirname(__file__)
    copyright_dir = os.path.join(here, "..", "assets", "copyright-img")

    return [
        os.path.join(copyright_dir, entry)
        for entry in os.listdir(copyright_dir)
        if entry.lower().endswith(image_suffixes)
    ]

def convert_image_to_rgba(image):
    """Return *image* converted to RGBA when it is palette-based.

    Only images whose ``mode`` is ``'P'`` are converted (so that palette
    transparency is handled); an image in any other mode is returned as the
    very same object that was passed in.
    """
    return image.convert('RGBA') if image.mode == 'P' else image

def is_similar_image(image_path, reference_image_path, similarity_threshold=0.95):
    """Tell whether two images are perceptually similar.

    Both images are reduced to an average hash and compared bit by bit.

    Args:
        image_path: Image to test (anything ``PIL.Image.open`` accepts).
        reference_image_path: Reference image to compare against.
        similarity_threshold: Minimum similarity, between 0 and 1, for the
            images to count as similar.

    Returns:
        True when ``1 - hamming_distance / hash_bits`` is at least
        *similarity_threshold*.
    """
    # Open both files in a ``with`` block so the underlying file handles are
    # always released, even if hashing fails; a handle left open can keep the
    # caller from deleting the file afterwards.
    with Image.open(image_path) as img1, Image.open(reference_image_path) as img2:
        # Palette images are expanded to RGBA first so transparency is handled.
        hash1 = imagehash.average_hash(convert_image_to_rgba(img1))
        hash2 = imagehash.average_hash(convert_image_to_rgba(img2))

    # Subtracting two hashes gives their Hamming distance; dividing by the
    # total number of hash bits turns it into a 0..1 dissimilarity.
    total_bits = hash1.hash.size
    similarity = 1 - (hash1 - hash2) / total_bits
    return similarity >= similarity_threshold

def _find_matching_copyright_image(image_blob, copyright_image_paths):
    """Return the first reference image path similar to *image_blob*, or None."""
    # A unique temporary file avoids clashes with a fixed name in the current
    # working directory; the ``finally`` guarantees it is removed even when the
    # comparison raises.
    fd, temp_image_path = tempfile.mkstemp(suffix=".png")
    try:
        with os.fdopen(fd, "wb") as temp_file:
            temp_file.write(image_blob)
        for copyright_image in copyright_image_paths:
            if is_similar_image(temp_image_path, copyright_image):
                return copyright_image
        return None
    finally:
        os.remove(temp_image_path)


def _delete_slides(prs, slide_indexes):
    """Remove the slides at the given 0-based *slide_indexes* from *prs*."""
    slide_id_list = prs.slides._sldIdLst
    # Resolve every index against a single snapshot taken before anything is
    # removed, so earlier deletions cannot shift the position of later ones.
    slide_ids = list(slide_id_list)
    for index in sorted(set(slide_indexes)):
        slide_id = slide_ids[index]
        # Drop the relationship as well, otherwise the slide part would still
        # be written into the saved package.
        prs.part.drop_rel(slide_id.rId)
        slide_id_list.remove(slide_id)


def remove_copyright_images(pptx_path, copyright_image_paths):
    """Strip copyright pictures and copyright text from one presentation.

    For every slide of the file at *pptx_path*:

    * a picture that is similar to one of *copyright_image_paths* is removed
      and its slide is marked for deletion;
    * a text shape containing one of the slide-level keywords marks the whole
      slide for deletion (the remaining shapes of that slide are not examined);
    * a text shape containing one of the text-box keywords is removed while
      the slide is kept.

    The file is saved in place when anything changed; otherwise a notice is
    printed and the file is left untouched.

    Args:
        pptx_path: Path of the presentation to clean; it is overwritten.
        copyright_image_paths: Paths of the reference copyright images.
    """
    prs = Presentation(pptx_path)
    file_name = os.path.basename(pptx_path)

    # Shape type value that identifies a picture shape.
    picture_shape_type = 13

    # Keywords whose presence removes only the text box that contains them.
    del_text_keywords = [
        "Speaker name and title",
        "OfficePLUS", "Presenter name",
        "www.officeplus.cn",
        "第一PPT模板网-WWW.1PPT.COM",
        "标题字体来源于字魂网，未经授权不可商用",
        "以上字体来源字魂网，未经授权不得商用"
    ]
    # Keywords whose presence removes the whole slide.
    del_text_keywords_slide = ["模板中使用的字体为开源字体","一站式办公内容服务平台", "欢迎关注OfficePLUS官方渠道"]

    # NOTE: an earlier, disabled prototype also replaced the keyword "第一PPT"
    # with "创意素材" in text shapes; it was never executed and has been dropped.

    # A set, because one slide can be flagged several times.
    slides_to_delete = set()
    change_text = False

    for index, slide in enumerate(prs.slides):
        # Walk a snapshot of the shapes: shapes are removed inside the loop.
        for shape in list(slide.shapes):
            if shape.shape_type == picture_shape_type:
                matched = _find_matching_copyright_image(
                    shape.image.blob, copyright_image_paths
                )
                if matched is not None:
                    print(f"'{file_name}'有相似的图片'{os.path.basename(matched)}',在第{index + 1}页")
                    slide.shapes._spTree.remove(shape._element)
                    slides_to_delete.add(index)
                # Picture shapes carry no text frame, nothing more to check.
                continue

            if not shape.has_text_frame:
                continue

            text_content = shape.text

            # Slide-level keyword: drop the whole slide, skip its other shapes.
            if any(keyword in text_content for keyword in del_text_keywords_slide):
                print(f"'{file_name}'在第 {index + 1} 页需要删除，内容为: '{text_content}'")
                slides_to_delete.add(index)
                break

            # Text-box keyword: drop only this shape.
            if any(keyword in text_content for keyword in del_text_keywords):
                change_text = True
                print(f"'{file_name}'在第 {index + 1} 页删除文本框，内容为: '{text_content}'")
                slide.shapes._spTree.remove(shape._element)

    _delete_slides(prs, slides_to_delete)

    if slides_to_delete or change_text:
        # Overwrite the original file with the cleaned presentation.
        prs.save(pptx_path)
    else:
        print(f"{pptx_path}没有版权信息或者版权文案，不需要删除替换")

def process_ppt_files(directory):
    """Clean every PowerPoint file located directly inside *directory*.

    Entries whose name starts with ``~$`` are ignored, as are entries that do
    not end in ``.pptx`` or ``.ppt``. Each remaining file is passed to
    ``remove_copyright_images`` together with the reference copyright images.

    Files that are not zip archives (for example legacy binary ``.ppt``
    files) cannot be opened by python-pptx; they are reported and skipped so
    that a single unreadable file does not abort the whole batch.

    Args:
        directory: Folder whose PowerPoint files should be processed.
    """
    copyright_image_paths = get_copyright_image_paths()

    for filename in os.listdir(directory):
        # Skip "~$" entries (typically the lock files Office creates while a
        # document is open).
        if filename.startswith('~$'):
            continue

        if not filename.lower().endswith(('.pptx', '.ppt')):
            continue

        pptx_file_path = os.path.join(directory, filename)

        # python-pptx only reads the zip-based Open XML format.
        if os.path.isfile(pptx_file_path) and not zipfile.is_zipfile(pptx_file_path):
            print(f"'{filename}'不是有效的 pptx 压缩包格式（例如旧版 .ppt），已跳过")
            continue

        remove_copyright_images(pptx_file_path, copyright_image_paths)

# Script entry point: process the folder that Config.get_latest_folder
# returns for the configured work path.
process_ppt_files(
    Config.get_latest_folder(Config.WORK_PATH)
)