feat: python识别版权图片和文字

1 year ago · d291a847e5
parent bcb6b74442
commit d291a847e5
4 changed files with 91 additions and 9 deletions
--- a/assets/copyright-img/1ppt.png
+++ b/assets/copyright-img/1ppt.png
--- a/assets/copyright-img/缘梦.png
+++ b/assets/copyright-img/缘梦.png
--- a/config/index.js
+++ b/config/index.js
@ -10,7 +10,7 @@ const config = {
         * 则直接使用inputDir作为工作目录
         */
        inputDir: '',
-        // inputDir: 'E:/商品资料汇总/商品资料(1051-1100)/ppt1056-研究生复试',
+        // inputDir: 'C:/Users/Administrator/Desktop/test',
        // 指定要查找的目录
        directoryPath: 'E:/商品资料汇总/商品资料(1201-1250)',
        // 要转换的文件个数, 0表示全部转换
--- a/py-src/del_copyright_img.py
+++ b/py-src/del_copyright_img.py
@ -1,17 +1,45 @@
 # Install dependencies: pip install python-pptx Pillow imagehash
 from pptx import Presentation
 from pptx.util import Inches
+from pptx.dml.color import RGBColor
 from PIL import Image
 import imagehash
 import os
+from config import Config

-def is_similar_image(image_path, reference_image_path, similarity_threshold=0.7):
+def get_copyright_image_paths():
+    # Return the paths of all reference images in ../assets/copyright-img, built relative to this file's directory
+    copyright_dir = os.path.join(
+        os.path.dirname(__file__), "..", "assets", "copyright-img"
+    )
+    
+    # Collect every entry whose name has an image extension; order is whatever os.listdir returns
+    copyright_image_paths = []
+    for filename in os.listdir(copyright_dir):
+        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):  # supported image formats (extension match is case-insensitive)
+            copyright_image_paths.append(os.path.join(copyright_dir, filename))
+    
+    return copyright_image_paths
+
+def convert_image_to_rgba(image):
+    """
+    Convert a palette-mode ('P') image to RGBA so transparency is handled; an image in any other mode is returned unchanged.
+    """
+    if image.mode == 'P':
+        image = image.convert('RGBA')
+    return image
+
+def is_similar_image(image_path, reference_image_path, similarity_threshold=0.95):
    """
    检查两张图片之间的相似度
    """
    img1 = Image.open(image_path)
    img2 = Image.open(reference_image_path)

+    # 转换图像为 RGBA 格式
+    img1 = convert_image_to_rgba(img1)
+    img2 = convert_image_to_rgba(img2)
+
    # 计算哈希值
    hash1 = imagehash.average_hash(img1)
    hash2 = imagehash.average_hash(img2)
@ -26,6 +54,11 @@ def remove_copyright_images(pptx_path, copyright_image_paths):

    # 使用一个列表来记录要删除的幻灯片索引
    slides_to_delete = []
+    change_text = False
+    # 定义要替换的关键字数组
+    replace_text_keywords = ["第一PPT"]
+    # 定义要删除的关键字符数组
+    del_text_keywords = ["Speaker name and title", "OfficePLUS", "Presenter name", "www.officeplus.cn"]

    # 遍历每一页幻灯片
    for index, slide in enumerate(prs.slides):
@ -41,7 +74,7 @@ def remove_copyright_images(pptx_path, copyright_image_paths):
                # 检查相似度
                for copyright_image in copyright_image_paths:
                    if is_similar_image(temp_image_path, copyright_image):
-                        print(f"有相似的图片,在第{index + 1}页")
+                        print(f"'{os.path.basename(pptx_path)}'有相似的图片'{os.path.basename(copyright_image)}',在第{index + 1}页")
                        # 删除该形状
                        slide.shapes._spTree.remove(shape._element)

@ -51,20 +84,69 @@ def remove_copyright_images(pptx_path, copyright_image_paths):
                # 删除临时图片
                os.remove(temp_image_path)

+             # 检查形状是否为文本框
+            if shape.has_text_frame:
+                text_content = shape.text
+
+                # 检查文本内容是否包含要删除的关键字符数组中的任意字符
+                if any(char in text_content for char in del_text_keywords):
+                    if not change_text:
+                        change_text = True
+                    print(f"'{os.path.basename(pptx_path)}'在第 {index + 1} 页删除文本框，内容为: '{text_content}'")
+                    # 删除文本框
+                    slide.shapes._spTree.remove(shape._element)
+
+                """
+                # 判断并替换文本
+                for keyword in replace_text_keywords:
+                    if keyword in text_content:
+                        if not change_text:
+                            change_text = True
+                        print(f"'{os.path.basename(pptx_path)}'在第 {index + 1} 页替换文本 '{text_content}' 为 '创意素材'")
+                        
+                        # 记录原字体样式
+                        original_font = shape.text_frame.paragraphs[0].runs[0].font
+
+                        new_text = text_content.replace(keyword, "创意素材")
+                        shape.text = new_text
+
+                        # 应用原字体样式
+                        for paragraph in shape.text_frame.paragraphs:
+                            for run in paragraph.runs:
+                                run.font.bold = original_font.bold
+                                run.font.italic = original_font.italic
+                                run.font.underline = original_font.underline
+                                run.font.size = original_font.size
+
+                                # 处理颜色
+                                if isinstance(original_font.color, RGBColor):
+                                    run.font.color.rgb = original_font.color.rgb
+                                elif hasattr(original_font.color, 'theme_color'):
+                                    run.font.color.rgb = RGBColor(255, 255, 255)
+                                else:
+                                    # 处理未识别的颜色类型
+                                    run.font.color.rgb = RGBColor(255, 255, 255)
+                """
+
    # 删除记录中的幻灯片
    for index in slides_to_delete:
        slides = list(prs.slides._sldIdLst)
        current_slide_to_delete = slides[index]
        prs.slides._sldIdLst.remove(current_slide_to_delete)

-    if(len(slides_to_delete) > 0):
+    if(len(slides_to_delete) > 0 or change_text):
        # 保存修改后的 PowerPoint 文件
        prs.save(pptx_path)
    else:
-        print(f"{pptx_path}没有版权信息，不需要删除")
+        print(f"{pptx_path}没有版权信息或者版权文案，不需要删除替换")
+
+def process_ppt_files(directory):
+    copyright_image_paths = get_copyright_image_paths()  # reference image paths, looked up once and reused for every file

-# 使用示例
-pptx_file_path = '/Users/minya/Desktop/test/1.pptx'  # 输入 PowerPoint 文件路径
-copyright_image_paths = ['/Users/minya/Desktop/test/1.png']  # 版权图片路径列表
+    # Run remove_copyright_images on every .pptx/.ppt entry directly inside `directory` (os.listdir does not recurse)
+    for filename in os.listdir(directory):
+        if filename.lower().endswith(('.pptx', '.ppt')):  # extension match is case-insensitive; NOTE(review): python-pptx targets .pptx — confirm legacy .ppt files can be opened
+            pptx_file_path = os.path.join(directory, filename)
+            remove_copyright_images(pptx_file_path, copyright_image_paths)

-remove_copyright_images(pptx_file_path, copyright_image_paths)
+process_ppt_files(Config.get_latest_folder(Config.WORK_PATH))  # module-level call with no __main__ guard: runs whenever this file is executed or imported