<!--
You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

497 lines
19 KiB
HTML

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.
-->

<!doctype html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8" />
<meta
name="viewport"
content="width=device-width, initial-scale=1.0"
/>
<title>完全本地化OCR识别工具</title>
<style>
/* Page shell: centred column, at most 800px wide, on a light grey background. */
body {
font-family: Arial, sans-serif;
max-width: 800px;
margin: 0 auto;
padding: 20px;
line-height: 1.6;
background-color: #f5f5f5;
}
/* Page title. */
h1 {
color: #2c3e50;
text-align: center;
margin-bottom: 30px;
}
/* White card that wraps the whole tool. */
.container {
background-color: white;
padding: 25px;
border-radius: 8px;
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
}
/* Upload / drop area with a dashed outline. */
.upload-section {
border: 2px dashed #7f8c8d;
padding: 30px;
text-align: center;
border-radius: 5px;
background-color: #f9f9f9;
transition: all 0.3s;
margin-bottom: 20px;
}
/* Hover highlight for the upload area. Inline styles set by the drag handlers take precedence over this rule. */
.upload-section:hover {
border-color: #3498db;
background-color: #f0f7fc;
}
/* Image preview: hidden by default; the script switches it to display: block once a file has been read. */
#imagePreview {
max-width: 100%;
max-height: 300px;
margin-top: 15px;
display: none;
border-radius: 4px;
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.2);
}
/* Primary (blue) button. */
.btn {
background-color: #3498db;
color: white;
border: none;
padding: 10px 20px;
border-radius: 4px;
cursor: pointer;
font-size: 16px;
transition: background-color 0.3s;
}
.btn:hover {
background-color: #2980b9;
}
/* Progress area: hidden by default; performOCR sets it to display: block when a run starts. */
.progress-container {
margin: 20px 0;
display: none;
}
/* Progress track; overflow is hidden so the fill is clipped to the rounded corners. */
.progress-bar {
height: 20px;
background-color: #ecf0f1;
border-radius: 4px;
overflow: hidden;
}
/* Progress fill: the script drives its width (and turns it red on failure); width changes are animated. */
.progress-fill {
height: 100%;
background-color: #2ecc71;
width: 0%;
transition: width 0.3s;
}
/* Status line shown under the progress bar. */
#statusText {
margin-top: 8px;
font-size: 14px;
color: #7f8c8d;
text-align: center;
}
/* Results area: hidden by default; performOCR shows it after a successful recognition. */
.result-section {
margin-top: 30px;
display: none;
}
/* One extracted header row: label, value and copy button laid out horizontally. */
.header-item {
display: flex;
align-items: center;
margin-bottom: 12px;
padding: 12px;
background-color: #f8f9fa;
border-radius: 4px;
border-left: 4px solid #3498db;
}
/* Fixed-minimum-width label column so the values line up. */
.header-label {
font-weight: bold;
min-width: 200px;
color: #2c3e50;
}
/* Value column: takes the remaining width and may break anywhere so long tokens wrap. */
.header-value {
flex-grow: 1;
word-break: break-all;
padding: 0 15px;
font-family: monospace;
}
/* Green "copy" button inside each header row. */
.copy-btn {
background-color: #27ae60;
color: white;
border: none;
padding: 6px 12px;
border-radius: 3px;
cursor: pointer;
font-size: 14px;
transition: all 0.2s;
}
.copy-btn:hover {
background-color: #219653;
}
/* Slight shrink while the button is pressed. */
.copy-btn:active {
transform: scale(0.95);
}
/* Raw OCR text box: keeps line breaks (pre-wrap) and scrolls vertically past 300px. */
#ocrTextResult {
white-space: pre-wrap;
background-color: #f8f9fa;
padding: 15px;
border-radius: 4px;
font-family: monospace;
max-height: 300px;
overflow-y: auto;
border: 1px solid #ddd;
}
/* Section headings inside the results area. */
.section-title {
color: #2c3e50;
border-bottom: 2px solid #3498db;
padding-bottom: 8px;
margin-top: 25px;
}
</style>
</head>
<body>
<div class="container">
  <h1>完全本地化OCR识别工具</h1>
  <!-- Upload area: drag-and-drop target plus a button that opens the hidden file picker. -->
  <div
    class="upload-section"
    id="dropArea"
  >
    <p style="font-size: 18px; margin-bottom: 20px">拖放图片到此处或点击下方按钮选择</p>
    <input
      type="file"
      id="fileInput"
      accept="image/*"
      style="display: none"
    />
    <!-- Explicit type="button": this control only opens the file picker and must never act as a submit button. -->
    <button
      class="btn"
      type="button"
      onclick="document.getElementById('fileInput').click()"
    >
      选择图片文件
    </button>
    <img
      id="imagePreview"
      alt="图片预览"
    />
  </div>
  <!-- Progress area: hidden by the stylesheet until the script shows it. -->
  <div
    class="progress-container"
    id="progressContainer"
  >
    <div class="progress-bar">
      <div
        class="progress-fill"
        id="progressFill"
      ></div>
    </div>
    <!-- role="status" makes this a polite live region, so text changes written by the script are announced by screen readers. -->
    <div
      id="statusText"
      role="status"
    >
      准备就绪
    </div>
  </div>
  <!-- Results area: raw OCR text followed by the extracted header rows; filled in by the script. -->
  <div
    class="result-section"
    id="resultSection"
  >
    <h2 class="section-title">OCR识别结果</h2>
    <div id="ocrTextResult"></div>
    <h2 class="section-title">提取的Header信息</h2>
    <div id="headerResults"></div>
  </div>
</div>
<!-- 本地化Tesseract.js库 -->
<script src="js/tesseract.min.js"></script>
<script>
/**
 * Probes for WebAssembly SIMD support and replaces `window.Tesseract` with the
 * object returned by `Tesseract.create(...)`, configured to load the worker,
 * core and language data from local relative paths.
 *
 * NOTE(review): this function is not invoked on page load (the call is
 * commented out) and performOCR passes its own options to
 * `Tesseract.createWorker`. Confirm the bundled tesseract.min.js exposes
 * `Tesseract.create` before re-enabling it.
 */
function initTesseract() {
  // Validate a small probe module to detect SIMD support; any exception
  // raised while probing is treated as "not supported".
  let simdSupported;
  try {
    const probeModule = new Uint8Array([
      0, 97, 115, 109, 1, 0, 0, 0, 1, 5, 1, 96, 0, 1, 123, 3, 2, 1, 0, 10, 10, 1, 8, 0, 65, 0, 253, 15, 253, 98, 11,
    ]);
    simdSupported = WebAssembly.validate(probeModule);
  } catch (e) {
    simdSupported = false;
  }

  let simdLabel = '否';
  if (simdSupported) {
    simdLabel = '是';
  }
  console.log('SIMD支持:', simdLabel);
  console.log(Tesseract);

  // Pick the core build that matches the probe result.
  let corePath = 'js/tesseract-core.wasm.js';
  if (simdSupported) {
    corePath = 'js/tesseract-core-simd.wasm.js';
  }

  const options = {
    workerPath: 'worker/worker.min.js',
    corePath: corePath,
    langPath: 'lang/',
    workerBlobURL: false,
    cacheMethod: 'indexedDB',
    cachePath: 'tesseract-cache',
    gzip: false,
    logger: (m) => console.log(m),
  };

  // Replace the global with the configured instance.
  window.Tesseract = Tesseract.create(options);
}
// Wire up the UI once the document has been parsed.
// Note: initTesseract() is not invoked here; only the event listeners are set up.
document.addEventListener('DOMContentLoaded', () => {
  setupEventListeners();
});
/**
 * Attaches the file-picker `change` handler and the drag-and-drop handlers
 * (`dragover`, `dragleave`, `drop`) to the upload area.
 *
 * A dropped file is forwarded to processImageFile only when its MIME type
 * starts with "image/"; anything else is ignored.
 */
function setupEventListeners() {
  const fileInput = document.getElementById('fileInput');
  const dropArea = document.getElementById('dropArea');

  fileInput.addEventListener('change', handleFileSelect);

  // Highlight the drop target while something is dragged over it.
  const highlight = () => {
    dropArea.style.borderColor = '#3498db';
    dropArea.style.backgroundColor = '#eaf2f8';
  };
  // Clear the inline overrides instead of hard-coding the resting colours:
  // inline styles outrank the stylesheet, so leaving them set would disable
  // the .upload-section:hover rule after the first drag.
  const clearHighlight = () => {
    dropArea.style.borderColor = '';
    dropArea.style.backgroundColor = '';
  };

  dropArea.addEventListener('dragover', (e) => {
    // preventDefault on dragover is what allows the element to receive a drop.
    e.preventDefault();
    e.stopPropagation();
    highlight();
  });

  dropArea.addEventListener('dragleave', (e) => {
    e.preventDefault();
    e.stopPropagation();
    clearHighlight();
  });

  dropArea.addEventListener('drop', (e) => {
    // Stop the browser from navigating to the dropped file.
    e.preventDefault();
    e.stopPropagation();
    clearHighlight();

    const file = e.dataTransfer.files[0];
    // Anchored prefix check; the previous match('image.*') was an unanchored
    // regular expression that accepted "image" anywhere in the type.
    if (file && file.type.startsWith('image/')) {
      processImageFile(file);
    }
  });
}
/**
 * `change` handler for the hidden file input.
 *
 * Forwards the first selected file to processImageFile when its MIME type
 * starts with "image/"; otherwise does nothing.
 *
 * @param {Event} e - change event whose target is the file input.
 */
function handleFileSelect(e) {
  const input = e.target;
  const file = input.files && input.files[0];

  // Clear the selection so that picking the same file again (for example to
  // retry after a failed run) still fires a change event. The File object
  // captured above stays usable.
  input.value = '';

  // Anchored prefix check; the previous match('image.*') was an unanchored
  // regular expression that accepted "image" anywhere in the type.
  if (file && file.type.startsWith('image/')) {
    processImageFile(file);
  }
}
/**
 * Shows a preview of the chosen image and starts OCR on it.
 *
 * The preview is loaded asynchronously as a data URL; OCR is started right
 * away without waiting for the preview to finish loading.
 *
 * @param {File} file - image file to preview and recognise.
 */
function processImageFile(file) {
  const previewReader = new FileReader();

  // Once the data URL is ready, put it into the <img> and reveal it.
  previewReader.addEventListener('load', () => {
    const previewImg = document.getElementById('imagePreview');
    previewImg.src = previewReader.result;
    previewImg.style.display = 'block';
  });
  previewReader.readAsDataURL(file);

  // Kick off recognition.
  performOCR(file);
}
/**
 * Runs OCR on an image with a Tesseract.js worker and renders the outcome.
 *
 * Resets the progress/result UI, creates a worker with locally hosted assets,
 * loads and initialises the `chi_sim+eng` languages, recognises the image,
 * then shows the raw text and the headers extracted from it. Failures are
 * logged and reported in the status line; the function itself does not
 * reject. The worker is terminated whether the run succeeds or fails.
 *
 * @param {File} imageFile - image to recognise.
 * @returns {Promise<void>}
 */
async function performOCR(imageFile) {
  const progressContainer = document.getElementById('progressContainer');
  const progressFill = document.getElementById('progressFill');
  const statusText = document.getElementById('statusText');
  const resultSection = document.getElementById('resultSection');
  const ocrTextResult = document.getElementById('ocrTextResult');
  const headerResults = document.getElementById('headerResults');

  // Reset the UI for a fresh run.
  progressContainer.style.display = 'block';
  progressFill.style.width = '0%';
  // Drop the red colour a previous failed run may have left on the bar.
  progressFill.style.backgroundColor = '';
  statusText.textContent = '正在初始化OCR引擎...';
  resultSection.style.display = 'none';
  headerResults.innerHTML = '';

  let worker = null;
  try {
    // NOTE(review): the SIMD core is always requested here, although
    // initTesseract contains a SIMD probe that is not used - confirm that
    // every target browser supports WebAssembly SIMD.
    worker = await Tesseract.createWorker({
      workerPath: 'worker/worker.min.js',
      corePath: 'js/tesseract-core-simd.wasm.js',
      langPath: 'lang/',
      workerBlobURL: false,
      cacheMethod: 'indexedDB',
      cachePath: 'tesseract-cache',
      gzip: false,
      logger: (m) => updateProgress(m),
    });

    // Load and initialise the languages.
    await worker.loadLanguage('chi_sim+eng');
    await worker.initialize('chi_sim+eng');

    // Recognise the image.
    const result = await worker.recognize(imageFile);

    // Show the raw text.
    ocrTextResult.textContent = result.data.text;
    resultSection.style.display = 'block';

    // Extract and show the header values.
    const headers = extractHeadersFromOCR(result.data.text);
    displayHeaders(headers);

    statusText.textContent = '识别完成!';
    progressFill.style.width = '100%';
  } catch (error) {
    console.error('OCR处理错误:', error);
    // The rejection value is not guaranteed to be an Error object; fall back
    // to its string form so the status never reads "undefined".
    const reason = error && error.message ? error.message : String(error);
    statusText.textContent = `识别失败: ${reason}`;
    progressFill.style.backgroundColor = '#e74c3c';
  } finally {
    // Always release the worker, including when loading or recognition
    // failed; previously it was only terminated on the success path.
    if (worker) {
      try {
        await worker.terminate();
      } catch (terminateError) {
        console.error('清理worker失败:', terminateError);
      }
    }
  }
}
/**
 * Logger callback: mirrors a worker progress message into the progress UI.
 *
 * The bar width is updated only for the 'recognizing text' status. The status
 * line is updated for any message with a truthy `status`, with a rounded
 * percentage appended when `progress` is truthy.
 *
 * @param {{status?: string, progress?: number}} message - worker log message.
 */
function updateProgress(message) {
  const bar = document.getElementById('progressFill');
  const label = document.getElementById('statusText');
  const { status, progress } = message;

  if (status === 'recognizing text') {
    bar.style.width = `${progress * 100}%`;
  }

  // Messages without a status leave the status line untouched.
  if (!status) {
    return;
  }

  label.textContent = progress ? status + ` (${Math.round(progress * 100)}%)` : status;
}
/**
 * Extracts the known header values from raw OCR text.
 *
 * The text is flattened to one line and a few common OCR confusions are
 * normalised. Each header is then searched for under its canonical name and
 * its known misread variants, in the form `key: value` or `key=value` with an
 * optionally quoted value. A value is kept only if it passes
 * validateHeaderValue.
 *
 * @param {string} text - raw OCR output; null, undefined or empty is treated
 *   as empty text.
 * @returns {Object<string, string>} one entry per known header, in a fixed
 *   order; '' when the header was not found or failed validation.
 */
function extractHeadersFromOCR(text) {
  // Canonical header names mapped to the spellings OCR is known to produce.
  const headerKeys = {
    bkatimestamp: ['bkatimestamp', 'bkatimestarnp', 'bkatirnestamp'],
    bkatimestamptoken: ['bkatimestamptoken', 'pkatimestamptoken', 'bkatimestamptoke'],
    brcpEaSessionTicket: ['brcpEaSessionTicket', 'brepEaSessionTicket', 'brcpEaSessionTiket'],
    brcpEaDeviceId: ['brcpEaDeviceId', 'brcpEaDeviceld', 'brcpEaDeviceid'],
  };

  // NOTE(review): the character substitutions below run over the whole text,
  // so they also rewrite characters inside values (for example an "l"
  // followed by a letter in a session ticket). Confirm that is acceptable.
  const preprocessedText = String(text == null ? '' : text)
    .replace(/\s+/g, ' ') // line breaks and runs of whitespace become one space
    .replace(/[“”]/g, '"') // normalise curly double quotes
    .replace(/[‘’]/g, "'") // normalise curly single quotes (the class was empty before and never matched)
    .replace(/l(?=[a-zA-Z])/g, 'I') // "I" misread as "l"
    .replace(/%/g, 'a') // "a" misread as "%"
    .replace(/[¢€]/g, 'c'); // "c" misread as a currency symbol

  const results = {};
  for (const [correctKey, possibleVariants] of Object.entries(headerKeys)) {
    results[correctKey] = '';

    for (const variant of possibleVariants) {
      // key, separator (":" or "=") with optional spaces, optional opening
      // quote, then the value up to the next quote, whitespace or delimiter.
      // Backslashes are doubled because this is a template literal: a single
      // "\s" would evaluate to the letter "s", which made the value stop at
      // "s" instead of at whitespace.
      const regex = new RegExp(
        `(${variant})\\s*[:=]\\s*["']?([^"'\\s\\],;}{]+)["']?`,
        'i'
      );
      const match = preprocessedText.match(regex);
      if (!match) {
        continue;
      }

      // Keep only letters, digits, underscores and hyphens.
      const value = match[2].replace(/[^\w-]/g, '');
      if (validateHeaderValue(correctKey, value)) {
        results[correctKey] = value;
        break; // first valid variant wins
      }
    }
  }
  return results;
}

/**
 * Checks an extracted value against the expected shape for its header.
 *
 * @param {string} key - canonical header name.
 * @param {string} value - cleaned candidate value.
 * @returns {boolean} false for an empty value or one that does not match the
 *   header's pattern; true for headers without a specific rule.
 */
function validateHeaderValue(key, value) {
  if (!value) return false;
  switch (key) {
    case 'bkatimestamp':
      // Exactly 13 digits.
      return /^\d{13}$/.test(value);
    case 'bkatimestamptoken':
      // More than 30 hexadecimal characters.
      return value.length > 30 && /^[a-fA-F0-9]+$/.test(value);
    case 'brcpEaSessionTicket':
      // A letter followed by at least one letter or digit.
      return /^[a-zA-Z][a-zA-Z0-9]+$/.test(value);
    case 'brcpEaDeviceId':
      // Hexadecimal characters only.
      return /^[a-fA-F0-9]+$/.test(value);
    default:
      return true;
  }
}
/**
 * Renders one row per extracted header: name, value (or a "not recognised"
 * placeholder) and a button that copies the value to the clipboard.
 *
 * Existing rows in #headerResults are removed first.
 *
 * @param {Object<string, string>} headers - header name to extracted value;
 *   an empty string means the header was not recognised.
 */
function displayHeaders(headers) {
  const headerResults = document.getElementById('headerResults');
  headerResults.innerHTML = '';

  for (const [key, value] of Object.entries(headers)) {
    const itemDiv = document.createElement('div');
    itemDiv.className = 'header-item';

    const labelSpan = document.createElement('span');
    labelSpan.className = 'header-label';
    labelSpan.textContent = key;

    const valueSpan = document.createElement('span');
    valueSpan.className = 'header-value';
    valueSpan.textContent = value || '未识别到该字段';

    const copyBtn = document.createElement('button');
    copyBtn.type = 'button';
    copyBtn.className = 'copy-btn';
    copyBtn.textContent = '复制';

    // Show a temporary label, then restore the default one. Used for both
    // outcomes so a failure message does not stay on the button forever.
    const flashLabel = (label) => {
      copyBtn.textContent = label;
      setTimeout(() => {
        copyBtn.textContent = '复制';
      }, 2000);
    };

    copyBtn.onclick = () => {
      // Nothing to copy for unrecognised headers.
      if (!value) {
        return;
      }
      // The async Clipboard API can be missing (it is only exposed in secure
      // contexts); report that as a failed copy instead of throwing.
      if (!navigator.clipboard || typeof navigator.clipboard.writeText !== 'function') {
        console.error('复制失败:', new Error('Clipboard API unavailable'));
        flashLabel('复制失败');
        return;
      }
      navigator.clipboard
        .writeText(value)
        .then(() => {
          flashLabel('已复制!');
        })
        .catch((err) => {
          console.error('复制失败:', err);
          flashLabel('复制失败');
        });
    };

    itemDiv.appendChild(labelSpan);
    itemDiv.appendChild(valueSpan);
    itemDiv.appendChild(copyBtn);
    headerResults.appendChild(itemDiv);
  }
}
</script>
</body>
</html>