# OmniThink/src/post.py
import re
import json

def extract_citations(file_path):
    """Return the sorted, de-duplicated citation numbers found in a file."""
    # Collected citation numbers
    citations = []
    try:
        # Open the file and read its contents
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Match citation markers with a regular expression; citations are
        # assumed to look like [number], possibly back to back, e.g. [1][2]
        # (\d+ matches one or more digits)
        citation_matches = re.findall(r'\[\d+\]', content)
        # Extract the numeric part from each '[number]' match
        citations = [int(match.strip('[]')) for match in citation_matches]
        # De-duplicate and sort the citation numbers
        citations = sorted(set(citations))
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except Exception as e:
        print(f"Error while reading the file: {e}")
    return citations
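
# Illustrative behaviour (hypothetical file contents): for a file containing
# "the storm intensified [3], then weakened [1] before landfall [3]",
# extract_citations(path) returns [1, 3].
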
def load_map(path):
    """Load url_to_info.json and return a mapping from citation index to URL."""
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # Invert the url -> unified index mapping so citations can be resolved to URLs
    return {index: url for url, index in data['url_to_unified_index'].items()}
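
# Expected shape of the mapping file (a sketch inferred from the keys used above;
# real url_to_info.json files may carry additional fields):
#   {"url_to_unified_index": {"https://example.com/a": 1, "https://example.com/b": 2}}
# load_map would then return {1: 'https://example.com/a', 2: 'https://example.com/b'}.
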
def remove_lines_after_marker(content: str, marker: str = '---') -> str:
    """
    Remove every line equal to the marker together with the line
    immediately following it.
    """
    lines = content.splitlines()
    updated_lines = []
    skip_next_line = False
    for line in lines:
        if skip_next_line:
            # This line directly follows a marker line; drop it
            skip_next_line = False
            continue
        if line.strip() == marker:
            # Drop the marker line itself and flag the next line for removal
            skip_next_line = True
        else:
            updated_lines.append(line)
    return '\n'.join(updated_lines)
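
# Example: remove_lines_after_marker('intro\n---\ncaption\nbody') returns 'intro\nbody',
# dropping both the '---' line and the line right after it.
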
def add_ref(citations, mymap, content):
    """Append a reference list mapping citation numbers to URLs to the article."""
    references = "\n\nReferences:\n"
    for citation in citations:
        if citation in mymap:
            references += f"[{citation}] {mymap[citation]}\n"
        else:
            references += f"[{citation}] no matching URL found\n"
    # Append the reference list to the end of the article
    content += references
    return content
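
# Shape of the appended block (hypothetical values): with citations == [1, 2] and
# mymap == {1: 'https://example.com/a'}, the article gains a trailing section
#   References:
#   [1] https://example.com/a
#   [2] no matching URL found
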
def polish(article_path, map_path):
    # Extract the citation numbers used in the article
    citations = extract_citations(article_path)
    # Load the index -> URL mapping
    mymap = load_map(map_path)
    with open(article_path, 'r', encoding='utf-8') as file:
        content = file.read()
    content = add_ref(citations, mymap, content)
    content = remove_lines_after_marker(content, marker='---')
    content = remove_consecutive_duplicate_citations(content)
    # Write the polished article back in place
    with open(article_path, 'w', encoding='utf-8') as file:
        file.write(content)
    print("References successfully appended to the end of the article.")
def remove_consecutive_duplicate_citations(content: str) -> str:
    """
    Collapse consecutive duplicate citations within each line: when the same
    citation marker appears again with no other citation in between, the
    earlier occurrence is dropped and only the last one is kept.
    """
    # Split the content into lines
    lines = content.splitlines()
    processed_lines = []
    for line in lines:
        # Split the line into citation markers and the text between them
        parts = re.split(r'(\[\d+\])', line)
        # Parts of the line kept after removing duplicate citations
        new_parts = []
        last_citation = None
        last_citation_index = -1
        for part in parts:
            if re.match(r'\[\d+\]', part):
                if part == last_citation:
                    # Same citation as the previous one: drop the earlier occurrence
                    new_parts.pop(last_citation_index)
                last_citation = part
                last_citation_index = len(new_parts)
            new_parts.append(part)
        # Reconstruct the line, dropping empty fragments left by the split
        new_line = ''.join([p for p in new_parts if p != ''])
        processed_lines.append(new_line)
    # Join the processed lines back into a single string
    return '\n'.join(processed_lines)
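
# Example: 'text[1] more[1] other[2]' becomes 'text more[1] other[2]', while
# 'text[1] other[2] again[1]' is unchanged because a different citation intervenes.
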
def post_polish(path):
    """Polish the generated article inside the given output directory."""
    # Paths to the generated article and the URL index inside the output directory
    article_path = path + '/storm_gen_article_polished.txt'
    map_path = path + '/url_to_info.json'
    polish(article_path, map_path)
if __name__ == '__main__':
string = "你好[1]你好[2]你好啊[1]你非常好[1]你非常的好[1]你非常的好[1]你非常的好[1]你好[2]"
post_string = remove_consecutive_duplicate_citations(string)
print(post_string)
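    # End-to-end usage sketch (hypothetical output directory; it must contain
    # storm_gen_article_polished.txt and url_to_info.json produced upstream):
    # post_polish('./results/example_article')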