feat: python识别版权图片和文字

main
lichaojun 1 year ago
parent bcb6b74442
commit d291a847e5

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 842 KiB

@ -10,7 +10,7 @@ const config = {
* 则直接使用inputDir作为工作目录 * 则直接使用inputDir作为工作目录
*/ */
inputDir: '', inputDir: '',
// inputDir: 'E:/商品资料汇总/商品资料(1051-1100)/ppt1056-研究生复试', // inputDir: 'C:/Users/Administrator/Desktop/test',
// 指定要查找的目录 // 指定要查找的目录
directoryPath: 'E:/商品资料汇总/商品资料(1201-1250)', directoryPath: 'E:/商品资料汇总/商品资料(1201-1250)',
// 要转换的文件个数, 0表示全部转换 // 要转换的文件个数, 0表示全部转换

@ -1,17 +1,45 @@
# 依赖目录pip install python-pptx Pillow imagehash # 依赖目录pip install python-pptx Pillow imagehash
from pptx import Presentation from pptx import Presentation
from pptx.util import Inches from pptx.util import Inches
from pptx.dml.color import RGBColor
from PIL import Image from PIL import Image
import imagehash import imagehash
import os import os
from config import Config
def get_copyright_image_paths(copyright_dir=None):
    """Collect the paths of the reference copyright images.

    Args:
        copyright_dir: Directory to scan. When omitted, the
            ``assets/copyright-img`` directory one level above this file's
            directory is used.

    Returns:
        A list of paths (``copyright_dir`` joined with the file name) for
        every regular file whose name ends in a supported image extension,
        ordered by file name.

    Raises:
        OSError: Propagated from ``os.listdir`` when the directory cannot
            be read (for example, when it does not exist).
    """
    if copyright_dir is None:
        # Default location: assets/copyright-img relative to this file.
        copyright_dir = os.path.join(
            os.path.dirname(__file__), "..", "assets", "copyright-img"
        )

    # Supported image formats; matching is case-insensitive.
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

    # os.listdir returns names in arbitrary order, so sort them to keep the
    # comparison order (and therefore the log output) stable between runs.
    candidates = (
        os.path.join(copyright_dir, name)
        for name in sorted(os.listdir(copyright_dir))
        if name.lower().endswith(image_suffixes)
    )
    # A sub-directory named like an image must not be handed to the image
    # loader, so keep regular files only.
    return [path for path in candidates if os.path.isfile(path)]
def convert_image_to_rgba(image):
    """Return a palette-mode image converted to RGBA; pass others through.

    Only images whose ``mode`` is ``'P'`` are converted (via
    ``image.convert('RGBA')``); an image in any other mode is returned
    unchanged.
    """
    return image.convert('RGBA') if image.mode == 'P' else image
def is_similar_image(image_path, reference_image_path, similarity_threshold=0.95):
""" """
检查两张图片之间的相似度 检查两张图片之间的相似度
""" """
img1 = Image.open(image_path) img1 = Image.open(image_path)
img2 = Image.open(reference_image_path) img2 = Image.open(reference_image_path)
# 转换图像为 RGBA 格式
img1 = convert_image_to_rgba(img1)
img2 = convert_image_to_rgba(img2)
# 计算哈希值 # 计算哈希值
hash1 = imagehash.average_hash(img1) hash1 = imagehash.average_hash(img1)
hash2 = imagehash.average_hash(img2) hash2 = imagehash.average_hash(img2)
@ -26,6 +54,11 @@ def remove_copyright_images(pptx_path, copyright_image_paths):
# 使用一个列表来记录要删除的幻灯片索引 # 使用一个列表来记录要删除的幻灯片索引
slides_to_delete = [] slides_to_delete = []
change_text = False
# 定义要替换的关键字数组
replace_text_keywords = ["第一PPT"]
# 定义要删除的关键字符数组
del_text_keywords = ["Speaker name and title", "OfficePLUS", "Presenter name", "www.officeplus.cn"]
# 遍历每一页幻灯片 # 遍历每一页幻灯片
for index, slide in enumerate(prs.slides): for index, slide in enumerate(prs.slides):
@ -41,7 +74,7 @@ def remove_copyright_images(pptx_path, copyright_image_paths):
# 检查相似度 # 检查相似度
for copyright_image in copyright_image_paths: for copyright_image in copyright_image_paths:
if is_similar_image(temp_image_path, copyright_image): if is_similar_image(temp_image_path, copyright_image):
print(f"有相似的图片,在第{index + 1}") print(f"'{os.path.basename(pptx_path)}'有相似的图片'{os.path.basename(copyright_image)}',在第{index + 1}")
# 删除该形状 # 删除该形状
slide.shapes._spTree.remove(shape._element) slide.shapes._spTree.remove(shape._element)
@ -51,20 +84,69 @@ def remove_copyright_images(pptx_path, copyright_image_paths):
# 删除临时图片 # 删除临时图片
os.remove(temp_image_path) os.remove(temp_image_path)
# 检查形状是否为文本框
if shape.has_text_frame:
text_content = shape.text
# 检查文本内容是否包含要删除的关键字符数组中的任意字符
if any(char in text_content for char in del_text_keywords):
if not change_text:
change_text = True
print(f"'{os.path.basename(pptx_path)}'在第 {index + 1} 页删除文本框,内容为: '{text_content}'")
# 删除文本框
slide.shapes._spTree.remove(shape._element)
"""
# 判断并替换文本
for keyword in replace_text_keywords:
if keyword in text_content:
if not change_text:
change_text = True
print(f"'{os.path.basename(pptx_path)}'在第 {index + 1} 页替换文本 '{text_content}''创意素材'")
# 记录原字体样式
original_font = shape.text_frame.paragraphs[0].runs[0].font
new_text = text_content.replace(keyword, "创意素材")
shape.text = new_text
# 应用原字体样式
for paragraph in shape.text_frame.paragraphs:
for run in paragraph.runs:
run.font.bold = original_font.bold
run.font.italic = original_font.italic
run.font.underline = original_font.underline
run.font.size = original_font.size
# 处理颜色
if isinstance(original_font.color, RGBColor):
run.font.color.rgb = original_font.color.rgb
elif hasattr(original_font.color, 'theme_color'):
run.font.color.rgb = RGBColor(255, 255, 255)
else:
# 处理未识别的颜色类型
run.font.color.rgb = RGBColor(255, 255, 255)
"""
# 删除记录中的幻灯片 # 删除记录中的幻灯片
for index in slides_to_delete: for index in slides_to_delete:
slides = list(prs.slides._sldIdLst) slides = list(prs.slides._sldIdLst)
current_slide_to_delete = slides[index] current_slide_to_delete = slides[index]
prs.slides._sldIdLst.remove(current_slide_to_delete) prs.slides._sldIdLst.remove(current_slide_to_delete)
if(len(slides_to_delete) > 0): if(len(slides_to_delete) > 0 or change_text):
# 保存修改后的 PowerPoint 文件 # 保存修改后的 PowerPoint 文件
prs.save(pptx_path) prs.save(pptx_path)
else: else:
print(f"{pptx_path}没有版权信息,不需要删除") print(f"{pptx_path}没有版权信息或者版权文案,不需要删除替换")
def process_ppt_files(directory):
    """Run the copyright clean-up on every presentation in ``directory``.

    Each regular file directly inside ``directory`` whose name ends in
    ``.pptx`` or ``.ppt`` (case-insensitive) is passed to
    ``remove_copyright_images`` together with the reference image list.
    Sub-directories are not descended into.

    A file that fails to process is reported and skipped so that one bad
    file does not abort the rest of the batch.

    Args:
        directory: Path of the folder containing the presentations.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``directory`` cannot
            be read.
    """
    # Load the reference copyright images once for the whole batch.
    copyright_image_paths = get_copyright_image_paths()

    # Sorted so files are handled in a stable, predictable order.
    for filename in sorted(os.listdir(directory)):
        # Only .pptx and .ppt files are candidates.
        if not filename.lower().endswith(('.pptx', '.ppt')):
            continue
        # "~$name.pptx" is the lock file PowerPoint leaves next to an open
        # document; it is not a presentation.
        if filename.startswith('~$'):
            continue

        pptx_file_path = os.path.join(directory, filename)
        if not os.path.isfile(pptx_file_path):
            continue

        try:
            remove_copyright_images(pptx_file_path, copyright_image_paths)
        except Exception as exc:
            # Batch boundary: report and move on to the next file.
            # NOTE(review): python-pptx documents support for .pptx only, so
            # legacy .ppt files are expected to end up here — confirm whether
            # they should be filtered out or converted beforehand.
            print(f"'{filename}'处理失败,已跳过: {exc}")
remove_copyright_images(pptx_file_path, copyright_image_paths) process_ppt_files(Config.get_latest_folder(Config.WORK_PATH))
Loading…
Cancel
Save