File size: 4,843 Bytes
80a598c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import re
import json
def extract_citations(file_path):
    """
    Collect the distinct citation numbers that appear in a text file.

    Citations are markers of the form ``[n]`` (one or more digits in square
    brackets).  Returns a sorted list of unique ints.  If the file is missing
    or unreadable, a message is printed and an empty list is returned.
    """
    numbers = []
    try:
        # Read the whole article into memory; articles are small text files.
        with open(file_path, 'r', encoding='utf-8') as fh:
            text = fh.read()
    except FileNotFoundError:
        print(f"文件 {file_path} 未找到。")
    except Exception as exc:
        print(f"读取文件时发生错误:{exc}")
    else:
        # Pull the digits out of every "[n]" marker, dedupe, and sort.
        numbers = sorted({int(m.group(0)[1:-1])
                          for m in re.finditer(r'\[\d+\]', text)})
    return numbers
def load_map(path):
    """
    Build an index -> URL mapping from a url_to_info.json file.

    The JSON stores ``url_to_unified_index`` as ``{url: index}``; this
    function returns the inverted dict ``{index: url}``.
    """
    with open(path, 'r', encoding='utf-8') as fh:
        info = json.load(fh)
    # Invert the mapping: the article cites by index, so look up by index.
    return {index: url for url, index in info['url_to_unified_index'].items()}
def remove_lines_after_marker(content: str, marker: str = '---') -> str:
    """
    Drop every marker line and the single line that follows it.

    A line counts as a marker when it equals *marker* after stripping
    surrounding whitespace.  Note that both the marker line itself and the
    line immediately after it are removed; all other lines are kept in order.
    """
    kept = []
    drop_next = False
    for line in content.splitlines():
        if drop_next:
            # This line follows a marker — discard it (even if it is itself
            # a marker, matching the original one-line skip semantics).
            drop_next = False
        elif line.strip() == marker:
            drop_next = True
        else:
            kept.append(line)
    return '\n'.join(kept)
def add_ref(citations, mymap, content):
    """
    Append a reference section ("参考文献") to the article text.

    Each citation number in *citations* is resolved through *mymap*
    (index -> URL); numbers without a mapping get a placeholder line.
    Returns the article text with the reference block appended.
    """
    entries = []
    for number in citations:
        if number in mymap:
            entries.append(f"[{number}] {mymap[number]}\n")
        else:
            entries.append(f"[{number}] 未找到对应的URL\n")
    # Attach the reference list to the end of the article.
    return content + "\n\n参考文献:\n" + ''.join(entries)
def polish(article_path, map_path):
    """
    Post-process an article file in place.

    Steps: extract the ``[n]`` citations from the article, load the
    index->URL map, append a reference section, strip '---' marker lines
    (plus the line after each), and collapse consecutive duplicate
    citations.  The result overwrites *article_path*.
    """
    cited_numbers = extract_citations(article_path)
    index_to_url = load_map(map_path)
    with open(article_path, 'r', encoding='utf-8') as fh:
        text = fh.read()
    text = add_ref(cited_numbers, index_to_url, text)
    text = remove_lines_after_marker(text, marker='---')
    text = remove_consecutive_duplicate_citations(text)
    with open(article_path, 'w', encoding='utf-8') as fh:
        fh.write(text)
    print("参考文献已成功附加到文章末尾。")
def remove_consecutive_duplicate_citations(content: str) -> str:
    """
    Collapse runs of identical citations within each line.

    When the same ``[n]`` marker repeats with no *different* citation in
    between (intervening plain text is allowed), only the final occurrence
    is kept and the earlier ones are deleted.  Text segments are never
    modified; lines are processed independently.
    """
    citation_re = re.compile(r'\[\d+\]')
    cleaned = []
    for raw_line in content.splitlines():
        kept = []
        prev_cite = None      # most recently seen citation token
        prev_pos = -1         # its current index in `kept`
        # Split keeps the citation tokens thanks to the capture group.
        for token in re.split(r'(\[\d+\])', raw_line):
            if citation_re.match(token):
                if token == prev_cite:
                    # Same citation again: drop the earlier copy, keep this one.
                    del kept[prev_pos]
                prev_cite = token
                prev_pos = len(kept)
                kept.append(token)
            else:
                kept.append(token)
        # ''.join ignores the empty strings re.split may produce.
        cleaned.append(''.join(kept))
    return '\n'.join(cleaned)
# 设置路径
def post_polish(path):
    """Polish the generated article found under result directory *path*."""
    article_path = f"{path}/storm_gen_article_polished.txt"
    map_path = f"{path}/url_to_info.json"
    polish(article_path, map_path)
if __name__ == '__main__':
    # Smoke test for the consecutive-duplicate-citation cleanup.
    string = "你好[1]你好[2]你好啊[1]你非常好[1]你非常的好[1]你非常的好[1]你非常的好[1]你好[2]"
    post_string = remove_consecutive_duplicate_citations(string)
    print(post_string)
|