Skip to content

Commit

Permalink
feat: dynamic line space
Browse files Browse the repository at this point in the history
  • Loading branch information
timelic committed Jan 8, 2025
1 parent d22bbc6 commit 32128ec
Showing 1 changed file with 85 additions and 18 deletions.
103 changes: 85 additions & 18 deletions pdf2zh/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,13 @@ def render_char(


class Paragraph:
    """Geometry and formatting state of one paragraph being laid out.

    Instances are plain attribute holders: the constructor stores each
    argument unchanged and performs no validation.

    Args:
        y: Initial vertical coordinate of the paragraph.
        x: Initial horizontal coordinate of the paragraph.
        x0: Left boundary.
        x1: Right boundary.
        y0: First vertical boundary (see the note on ``y0``/``y1`` below).
        y1: Second vertical boundary.
        size: Font size.
        brk: Line-break flag.
    """

    def __init__(self, y, x, x0, x1, y0, y1, size, brk):
        self.y: float = y  # initial vertical coordinate
        self.x: float = x  # initial horizontal coordinate
        self.x0: float = x0  # left boundary
        self.x1: float = x1  # right boundary
        # Vertical extent. The layout pass widens these with
        # min(y0, ...) / max(y1, ...), so y0 ends up the smaller value and
        # y1 the larger one.
        # NOTE(review): the original comments labelled y0 "upper" and y1
        # "lower"; which one is visually on top depends on the page
        # coordinate origin -- confirm.
        self.y0: float = y0  # vertical boundary (tracked as the minimum)
        self.y1: float = y1  # vertical boundary (tracked as the maximum)
        self.size: float = size  # font size
        self.brk: bool = brk  # line-break flag

Expand Down Expand Up @@ -186,6 +188,8 @@ def receive_layout(self, ltpage: LTPage):
vmax: float = ltpage.width / 4 # 行内公式最大宽度
ops: str = "" # 渲染结果



def vflag(font: str, char: str): # 匹配公式(和角标)字体
if isinstance(font, bytes): # 不一定能 decode,直接转 str
font = str(font)
Expand Down Expand Up @@ -284,7 +288,7 @@ def vflag(font: str, char: str): # 匹配公式(和角标)字体
pstk[-1].brk = True
else: # 根据当前字符构建一个新的段落
sstk.append("")
pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, False))
pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.y0, child.y1, child.size, False))
if not cur_v: # 文字入栈
if ( # 根据当前字符修正段落属性
child.size > pstk[-1].size # 1. 当前字符比段落字体大
Expand All @@ -304,6 +308,8 @@ def vflag(font: str, char: str): # 匹配公式(和角标)字体
# 更新段落边界,因为段落内换行之后可能是公式开头,所以要在外边处理
pstk[-1].x0 = min(pstk[-1].x0, child.x0)
pstk[-1].x1 = max(pstk[-1].x1, child.x1)
pstk[-1].y0 = min(pstk[-1].y0, child.y0)
pstk[-1].y1 = max(pstk[-1].y1, child.y1)
# 更新上一个字符
xt = child
xt_cls = cls
Expand Down Expand Up @@ -366,20 +372,36 @@ def raw_string(fcur: str, cstk: str): # 编码字符串
else:
return "".join(["%02x" % ord(c) for c in cstk])

# 根据目标语言获取默认行距
lang_space = {
"zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4,
"ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8
}
default_line_spacing = lang_space.get(self.translator.lang_out.lower(), 1.1)

_x, _y = 0, 0

ops_list = []
def gen_op_txt(font, size, x, y, rtxt):
    """Format one text-drawing instruction string.

    Args:
        font: Font identifier placed after the leading ``/``.
        size: Font size, rendered with the ``f`` float format.
        x: Horizontal position, rendered with the ``f`` float format.
        y: Vertical position, rendered with the ``f`` float format.
        rtxt: Already-encoded text inserted verbatim between ``<`` and ``>``.

    Returns:
        The instruction string, including its trailing space.
    """
    return f"/{font} {size:f} Tf 1 0 0 1 {x:f} {y:f} Tm [<{rtxt}>] TJ "
def gen_op_line(x, y, xlen, ylen, linewidth):
    """Format one line-drawing instruction string.

    The emitted text starts with ``ET`` and ends with ``BT``, so it is meant
    to be spliced between text instructions.

    Args:
        x: Horizontal offset of the line start, rendered with ``f`` format.
        y: Vertical offset of the line start, rendered with ``f`` format.
        xlen: Horizontal extent of the line from its start point.
        ylen: Vertical extent of the line from its start point.
        linewidth: Stroke width.

    Returns:
        The instruction string, including its trailing space.
    """
    return f"ET q 1 0 0 1 {x:f} {y:f} cm [] 0 d 0 J {linewidth:f} w 0 0 m {xlen:f} {ylen:f} l S Q BT "
for id, new in enumerate(news):
x: float = pstk[id].x # 段落初始横坐标
y: float = pstk[id].y # 段落初始纵坐标
x0: float = pstk[id].x0 # 段落左边界
x1: float = pstk[id].x1 # 段落右边界
size: float = pstk[id].size # 段落字体大小
brk: bool = pstk[id].brk # 段落换行标记
cstk: str = "" # 当前文字栈
fcur: str = None # 当前字体 ID
x: float = pstk[id].x # 段落初始横坐标
y: float = pstk[id].y # 段落初始纵坐标
x0: float = pstk[id].x0 # 段落左边界
x1: float = pstk[id].x1 # 段落右边界
height: float = pstk[id].y1 - pstk[id].y0 # 段落高度
size: float = pstk[id].size # 段落字体大小
brk: bool = pstk[id].brk # 段落换行标记
cstk: str = "" # 当前文字栈
fcur: str = None # 当前字体 ID
line = 0 # 记录换行次数
tx = x
fcur_ = fcur
ptr = 0
log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[id]} | {new}")

ops_vals: list[dict] = []

while ptr < len(new):
vy_regex = re.match(
r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE
Expand Down Expand Up @@ -415,25 +437,48 @@ def raw_string(fcur: str, cstk: str): # 编码字符串
or x + adv > x1 + 0.1 * size # 3. 到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
):
if cstk:
ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
ops_vals.append({
"type": "text",
"font": fcur,
"size": size,
"x": tx,
"dy": 0,
"rtxt": raw_string(fcur, cstk),
"line": line
})
cstk = ""
if brk and x + adv > x1 + 0.1 * size: # 到达右边界且原文段落存在换行
x = x0
lang_space = {"zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
y -= size * lang_space.get(self.translator.lang_out.lower(), 1.1) # 小语种大多适配 1.1
line += 1
if vy_regex: # 插入公式
fix = 0
if fcur is not None: # 段落内公式修正纵向偏移
fix = varf[vid]
for vch in var[vid]: # 排版公式字符
vc = chr(vch.cid)
ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "
ops_vals.append({
"type": "text",
"font": self.fontid[vch.font],
"size": vch.size,
"x": x + vch.x0 - var[vid][0].x0,
"dy": fix + vch.y0 - var[vid][0].y0,
"rtxt": raw_string(self.fontid[vch.font], vc),
"line": line
})
if log.isEnabledFor(logging.DEBUG):
lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
_x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
for l in varl[vid]: # 排版公式线条
if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
ops_vals.append({
"type": "formula",
"x": l.pts[0][0] + x - var[vid][0].x0,
"dy": l.pts[0][1] + fix - var[vid][0].y0,
"linewidth": l.linewidth,
"xlen": l.pts[1][0] - l.pts[0][0],
"ylen": l.pts[1][1] - l.pts[0][1],
"line": line
})
else: # 插入文字缓冲区
if not cstk: # 单行开头
tx = x
Expand All @@ -451,9 +496,31 @@ def raw_string(fcur: str, cstk: str): # 编码字符串
_x, _y = x, y
# 处理结尾
if cstk:
ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
ops_vals.append({
"type": "text",
"font": fcur,
"size": size,
"x": tx,
"dy": 0,
"rtxt": raw_string(fcur, cstk),
"line": line
})

line_spacing = default_line_spacing

while (line + 1) * size * line_spacing > height and line_spacing >= 1:
line_spacing -= 0.05

for vals in ops_vals:
match vals["type"]:
case "text":
ops_list.append(gen_op_txt(vals["font"], vals["size"], vals["x"], vals["dy"] + y - vals["line"] * size * line_spacing, vals["rtxt"]))
case "formula":
ops_list.append(gen_op_line(vals["x"], vals["dy"] + y - vals["line"] * size * line_spacing, vals["xlen"], vals["ylen"], vals["linewidth"]))

for l in lstk: # 排版全局线条
if l.linewidth < 5: # hack 有的文档会用粗线条当图片背景
ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
ops = f"BT {ops}ET "
ops_list.append(gen_op_line(l.pts[0][0], l.pts[0][1], l.pts[1][0] - l.pts[0][0], l.pts[1][1] - l.pts[0][1], l.linewidth))

ops = f"BT {''.join(ops_list)}ET "
return ops

0 comments on commit 32128ec

Please sign in to comment.