feat: python识别版权图片和文字

main
lichaojun 1 year ago
parent bcb6b74442
commit d291a847e5

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 842 KiB

@ -10,7 +10,7 @@ const config = {
* 则直接使用inputDir作为工作目录
*/
inputDir: '',
// inputDir: 'E:/商品资料汇总/商品资料(1051-1100)/ppt1056-研究生复试',
// inputDir: 'C:/Users/Administrator/Desktop/test',
// 指定要查找的目录
directoryPath: 'E:/商品资料汇总/商品资料(1201-1250)',
// 要转换的文件个数, 0表示全部转换

@ -1,17 +1,45 @@
# 安装依赖: pip install python-pptx Pillow imagehash
from pptx import Presentation
from pptx.util import Inches
from pptx.dml.color import RGBColor
from PIL import Image
import imagehash
import os
from config import Config
def is_similar_image(image_path, reference_image_path, similarity_threshold=0.7):
def get_copyright_image_paths(copyright_dir=None):
    """Return the paths of the reference copyright images.

    Args:
        copyright_dir: Directory to scan. When omitted, the
            ``../assets/copyright-img`` folder relative to this file is
            used.

    Returns:
        A list of paths, sorted by file name, for every regular file in
        ``copyright_dir`` whose name ends in a supported image extension
        (``.png``, ``.jpg``, ``.jpeg``, ``.gif``, ``.bmp``; matched
        case-insensitively).

    Raises:
        FileNotFoundError: If ``copyright_dir`` does not exist.
    """
    if copyright_dir is None:
        # Default location: assets/copyright-img next to this file's parent.
        copyright_dir = os.path.join(
            os.path.dirname(__file__), "..", "assets", "copyright-img"
        )

    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    candidates = (
        os.path.join(copyright_dir, filename)
        # Sorted so the comparison order does not depend on the filesystem.
        for filename in sorted(os.listdir(copyright_dir))
        if filename.lower().endswith(image_suffixes)
    )
    # A sub-directory named like an image must not be handed to the image
    # loader, so keep regular files only.
    return [path for path in candidates if os.path.isfile(path)]
def convert_image_to_rgba(image):
    """Return ``image`` as RGBA when it is a palette-mode image.

    Only images whose ``mode`` is ``'P'`` are converted; every other image
    is returned unchanged (the very same object).
    """
    if image.mode != 'P':
        return image
    return image.convert('RGBA')
def is_similar_image(image_path, reference_image_path, similarity_threshold=0.95):
"""
检查两张图片之间的相似度
"""
img1 = Image.open(image_path)
img2 = Image.open(reference_image_path)
# 转换图像为 RGBA 格式
img1 = convert_image_to_rgba(img1)
img2 = convert_image_to_rgba(img2)
# 计算哈希值
hash1 = imagehash.average_hash(img1)
hash2 = imagehash.average_hash(img2)
@ -26,6 +54,11 @@ def remove_copyright_images(pptx_path, copyright_image_paths):
# 使用一个列表来记录要删除的幻灯片索引
slides_to_delete = []
change_text = False
# 定义要替换的关键字数组
replace_text_keywords = ["第一PPT"]
# 定义要删除的关键字符数组
del_text_keywords = ["Speaker name and title", "OfficePLUS", "Presenter name", "www.officeplus.cn"]
# 遍历每一页幻灯片
for index, slide in enumerate(prs.slides):
@ -41,7 +74,7 @@ def remove_copyright_images(pptx_path, copyright_image_paths):
# 检查相似度
for copyright_image in copyright_image_paths:
if is_similar_image(temp_image_path, copyright_image):
print(f"有相似的图片,在第{index + 1}")
print(f"'{os.path.basename(pptx_path)}'有相似的图片'{os.path.basename(copyright_image)}',在第{index + 1}")
# 删除该形状
slide.shapes._spTree.remove(shape._element)
@ -51,20 +84,69 @@ def remove_copyright_images(pptx_path, copyright_image_paths):
# 删除临时图片
os.remove(temp_image_path)
# 检查形状是否为文本框
if shape.has_text_frame:
text_content = shape.text
# 检查文本内容是否包含要删除的关键字符数组中的任意字符
if any(char in text_content for char in del_text_keywords):
if not change_text:
change_text = True
print(f"'{os.path.basename(pptx_path)}'在第 {index + 1} 页删除文本框,内容为: '{text_content}'")
# 删除文本框
slide.shapes._spTree.remove(shape._element)
"""
# 判断并替换文本
for keyword in replace_text_keywords:
if keyword in text_content:
if not change_text:
change_text = True
print(f"'{os.path.basename(pptx_path)}'在第 {index + 1} 页替换文本 '{text_content}''创意素材'")
# 记录原字体样式
original_font = shape.text_frame.paragraphs[0].runs[0].font
new_text = text_content.replace(keyword, "创意素材")
shape.text = new_text
# 应用原字体样式
for paragraph in shape.text_frame.paragraphs:
for run in paragraph.runs:
run.font.bold = original_font.bold
run.font.italic = original_font.italic
run.font.underline = original_font.underline
run.font.size = original_font.size
# 处理颜色
if isinstance(original_font.color, RGBColor):
run.font.color.rgb = original_font.color.rgb
elif hasattr(original_font.color, 'theme_color'):
run.font.color.rgb = RGBColor(255, 255, 255)
else:
# 处理未识别的颜色类型
run.font.color.rgb = RGBColor(255, 255, 255)
"""
# 删除记录中的幻灯片
for index in slides_to_delete:
slides = list(prs.slides._sldIdLst)
current_slide_to_delete = slides[index]
prs.slides._sldIdLst.remove(current_slide_to_delete)
if(len(slides_to_delete) > 0):
if(len(slides_to_delete) > 0 or change_text):
# 保存修改后的 PowerPoint 文件
prs.save(pptx_path)
else:
print(f"{pptx_path}没有版权信息,不需要删除")
print(f"{pptx_path}没有版权信息或者版权文案,不需要删除替换")
def process_ppt_files(directory):
    """Run copyright removal on every PowerPoint file directly in ``directory``.

    The reference copyright images are looked up once via
    ``get_copyright_image_paths()`` and the same list is passed to
    ``remove_copyright_images`` for each matching file.

    Args:
        directory: Folder to scan. Only its direct entries are examined;
            sub-folders are not descended into.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
    """
    # Use the discovered reference images; they must not be replaced by a
    # hard-coded path that only exists on one developer's machine.
    copyright_image_paths = get_copyright_image_paths()

    for filename in os.listdir(directory):
        # NOTE(review): legacy ``.ppt`` files are matched as well as
        # ``.pptx`` -- confirm remove_copyright_images can open them.
        if filename.lower().endswith(('.pptx', '.ppt')):
            pptx_file_path = os.path.join(directory, filename)
            # Each file is processed exactly once.
            remove_copyright_images(pptx_file_path, copyright_image_paths)
process_ppt_files(Config.get_latest_folder(Config.WORK_PATH))
Loading…
Cancel
Save