获取文本,按句整理
/**
 * Rebuilds the text of a set of pages line by line, then regroups it into sentences.
 *
 * Step 1: for every page, consecutive text items whose transform key matches
 * (first four transform entries plus the last one) are concatenated into one line.
 * Step 2: pages are sorted by page number and the lines are cut into sentences at
 * the first sentence terminator found on each line; text that follows the
 * terminator is carried over into the next sentence.
 *
 * @param pageContentCollection collection of `{ page, contentList }` entries, where
 *   every content item carries a `transform` array and a `str` text fragment.
 * @returns `longContextList` (per-page lines, sorted by page), `flattenSentenceList`
 *   (each sentence with the pages it spans) and `longParagraph` (all sentences,
 *   each one prefixed with a newline).
 */
const getLongContentFromPage = (pageContentCollection: PageContentCollection) => {
  // Sentence terminators: ASCII '.', ASCII '!', full-width '!' and the CJK full stop '。'.
  const endRegs = /[.!!。]/

  // Step 1: merge the text items of every page into lines.
  let longContextList: Array<LONG_CONTEXT_TYPE> = []
  _.forEach(pageContentCollection, pageContent => {
    const { page, contentList } = pageContent || {}
    const longContext: LONG_CONTEXT_TYPE = {
      page,
      textlines: [],
    }
    let thisLine = ''
    let lastTransformString = ''
    let hasContent = false
    _.forEach(contentList, content => {
      // Defaults keep a null item or one without `transform` from throwing;
      // such an item simply contributes its text (if any) to the current line.
      const { transform = [], str = '' } = content || {}
      // Two items belong to the same line when this key is identical.
      const sameLineTrans = transform
        .slice(0, 4)
        .concat([transform[transform.length - 1]])
        .join(',')
      if (lastTransformString && sameLineTrans !== lastTransformString) {
        // The key changed: close the current line and start a new one.
        longContext.textlines.push(thisLine)
        thisLine = str
      } else {
        thisLine += str
      }
      lastTransformString = sameLineTrans
      hasContent = true
    })
    // Flush the line that is still open, otherwise the text is lost whenever
    // the last item of the page starts a new line.
    if (hasContent) {
      longContext.textlines.push(thisLine)
    }
    longContextList.push(longContext)
  })

  // Sort by page number.
  longContextList = _.sortBy(longContextList, ['page'])

  // Step 2: build the sentence list.
  let sentence = ''
  let longParagraph = ''
  let pageList: number[] = []
  const flattenSentenceList: { sentence: string; pageList: number[] }[] = []
  _.forEach(longContextList, longContext => {
    const { textlines, page } = longContext || {}
    pageList.push(page)
    _.forEach(textlines, (textline, lineIndex) => {
      // `search` returns -1 only when there is no match, so a terminator at
      // index 0 is still recognised.
      const endIndex = textline.search(endRegs)
      if (endIndex >= 0) {
        // TODO: splitting by paragraph would be better.
        // NOTE: only the first terminator of a line is used; whatever follows it
        // is carried over as the beginning of the next sentence.
        const endSentence = sentence + textline.slice(0, endIndex + 1)
        flattenSentenceList.push({
          sentence: endSentence,
          pageList: [...pageList],
        })
        longParagraph += `\n${endSentence}`
        sentence = textline.slice(endIndex + 1)
        // Restart the page list. When the sentence ends exactly at the end of the
        // page's last line, the next sentence starts on the following page, so the
        // current page must not be listed for it.
        const endsWithPage = lineIndex + 1 === textlines.length && endIndex + 1 === textline.length
        pageList = endsWithPage ? [] : [page]
      } else {
        sentence += textline
      }
    })
  })

  // Append the trailing text that never reached a terminator, to both outputs.
  if (sentence.length) {
    flattenSentenceList.push({
      sentence,
      pageList: [...pageList],
    })
    longParagraph += `\n${sentence}`
  }
  console.log(`flattenSentenceList`, flattenSentenceList)
  return { longContextList, flattenSentenceList, longParagraph }
}
最后更新于