feat: python识别版权图片和文字

1 year ago · d291a847e5
parent bcb6b74442
commit d291a847e5
4 changed files with 91 additions and 9 deletions
--- a/assets/copyright-img/1ppt.png
+++ b/assets/copyright-img/1ppt.png
--- a/assets/copyright-img/缘梦.png
+++ b/assets/copyright-img/缘梦.png
--- a/config/index.js
+++ b/config/index.js
@ -10,7 +10,7 @@ const config = {
         * 则直接使用inputDir作为工作目录
         */
        inputDir: '',
-        // inputDir: 'E:/商品资料汇总/商品资料(1051-1100)/ppt1056-研究生复试',
+        // inputDir: 'C:/Users/Administrator/Desktop/test',
        // 指定要查找的目录
        directoryPath: 'E:/商品资料汇总/商品资料(1201-1250)',
        // 要转换的文件个数, 0表示全部转换
--- a/py-src/del_copyright_img.py
+++ b/py-src/del_copyright_img.py
@ -1,17 +1,45 @@
 # Dependencies (install with): pip install python-pptx Pillow imagehash
 from pptx import Presentation
 from pptx.util import Inches
 from pptx.dml.color import RGBColor
 from PIL import Image
 import imagehash
 import os
 from config import Config
-def is_similar_image(image_path, reference_image_path, similarity_threshold=0.7):
+def get_copyright_image_paths():
    # Return a list with the path of every reference copyright image found
    # directly inside the assets/copyright-img directory. Files are selected
    # purely by filename extension (case-insensitive); nothing is opened here.
    # Build the assets/copyright-img path relative to this file's directory
    copyright_dir = os.path.join(
        os.path.dirname(__file__), "..", "assets", "copyright-img"
    )
    # Collect the paths of all image files in that directory
    copyright_image_paths = []
    for filename in os.listdir(copyright_dir):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):  # supported image formats
            copyright_image_paths.append(os.path.join(copyright_dir, filename))
    return copyright_image_paths
 def convert_image_to_rgba(image):
    """
    Convert a palette-mode image to RGBA so that transparency is handled.

    Only an image whose mode is 'P' is converted; an image in any other
    mode is returned unchanged.
    """
    if image.mode == 'P':
        image = image.convert('RGBA')
    return image
 def is_similar_image(image_path, reference_image_path, similarity_threshold=0.95):
    """
    检查两张图片之间的相似度
    """
    img1 = Image.open(image_path)
    img2 = Image.open(reference_image_path)
    # 转换图像为 RGBA 格式
    img1 = convert_image_to_rgba(img1)
    img2 = convert_image_to_rgba(img2)
    # 计算哈希值
    hash1 = imagehash.average_hash(img1)
    hash2 = imagehash.average_hash(img2)
@ -26,6 +54,11 @@ def remove_copyright_images(pptx_path, copyright_image_paths):
    # 使用一个列表来记录要删除的幻灯片索引
    slides_to_delete = []
    change_text = False
    # 定义要替换的关键字数组
    replace_text_keywords = ["第一PPT"]
    # 定义要删除的关键字符数组
    del_text_keywords = ["Speaker name and title", "OfficePLUS", "Presenter name", "www.officeplus.cn"]
    # 遍历每一页幻灯片
    for index, slide in enumerate(prs.slides):
@ -41,7 +74,7 @@ def remove_copyright_images(pptx_path, copyright_image_paths):
                # 检查相似度
                for copyright_image in copyright_image_paths:
                    if is_similar_image(temp_image_path, copyright_image):
-                        print(f"有相似的图片,在第{index + 1}页")
+                        print(f"'{os.path.basename(pptx_path)}'有相似的图片'{os.path.basename(copyright_image)}',在第{index + 1}页")
                        # 删除该形状
                        slide.shapes._spTree.remove(shape._element)
@ -51,20 +84,69 @@ def remove_copyright_images(pptx_path, copyright_image_paths):
                # 删除临时图片
                os.remove(temp_image_path)
             # 检查形状是否为文本框
            if shape.has_text_frame:
                text_content = shape.text
                # 检查文本内容是否包含要删除的关键字符数组中的任意字符
                if any(char in text_content for char in del_text_keywords):
                    if not change_text:
                        change_text = True
                    print(f"'{os.path.basename(pptx_path)}'在第 {index + 1} 页删除文本框，内容为: '{text_content}'")
                    # 删除文本框
                    slide.shapes._spTree.remove(shape._element)
                """
                # 判断并替换文本
                for keyword in replace_text_keywords:
                    if keyword in text_content:
                        if not change_text:
                            change_text = True
                        print(f"'{os.path.basename(pptx_path)}'在第 {index + 1} 页替换文本 '{text_content}' 为 '创意素材'")
                        # 记录原字体样式
                        original_font = shape.text_frame.paragraphs[0].runs[0].font
                        new_text = text_content.replace(keyword, "创意素材")
                        shape.text = new_text
                        # 应用原字体样式
                        for paragraph in shape.text_frame.paragraphs:
                            for run in paragraph.runs:
                                run.font.bold = original_font.bold
                                run.font.italic = original_font.italic
                                run.font.underline = original_font.underline
                                run.font.size = original_font.size
                                # 处理颜色
                                if isinstance(original_font.color, RGBColor):
                                    run.font.color.rgb = original_font.color.rgb
                                elif hasattr(original_font.color, 'theme_color'):
                                    run.font.color.rgb = RGBColor(255, 255, 255)
                                else:
                                    # 处理未识别的颜色类型
                                    run.font.color.rgb = RGBColor(255, 255, 255)
                """
    # 删除记录中的幻灯片
    for index in slides_to_delete:
        slides = list(prs.slides._sldIdLst)
        current_slide_to_delete = slides[index]
        prs.slides._sldIdLst.remove(current_slide_to_delete)
-    if(len(slides_to_delete) > 0):
+    if(len(slides_to_delete) > 0 or change_text):
        # 保存修改后的 PowerPoint 文件
        prs.save(pptx_path)
    else:
-        print(f"{pptx_path}没有版权信息，不需要删除")
+        print(f"{pptx_path}没有版权信息或者版权文案，不需要删除替换")
 def process_ppt_files(directory):
    # Run remove_copyright_images on every PowerPoint file located directly
    # inside `directory` (os.listdir is used, so subdirectories are not walked).
    copyright_image_paths = get_copyright_image_paths()  # list of reference copyright image paths
-# Usage example
+    # Iterate over all entries in the given directory
-pptx_file_path = '/Users/minya/Desktop/test/1.pptx'  # input PowerPoint file path
+    for filename in os.listdir(directory):
-copyright_image_paths = ['/Users/minya/Desktop/test/1.png']  # list of copyright image paths
+        if filename.lower().endswith(('.pptx', '.ppt')):  # only handle pptx and ppt files
            # NOTE(review): python-pptx documents support for .pptx (Open XML)
            # files; legacy binary .ppt files matched here will presumably fail
            # to open in remove_copyright_images -- confirm and filter if so.
            pptx_file_path = os.path.join(directory, filename)
            remove_copyright_images(pptx_file_path, copyright_image_paths)
-remove_copyright_images(pptx_file_path, copyright_image_paths)
+process_ppt_files(Config.get_latest_folder(Config.WORK_PATH))