import re
import json


def extract_citations(file_path):
    """Return the sorted, de-duplicated citation numbers found in a text file."""
    citations = []
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Find every inline citation marker of the form [N].
        citation_matches = re.findall(r'\[\d+\]', content)
        citations = [int(match.strip('[]')) for match in citation_matches]
        # De-duplicate and sort in ascending order.
        citations = sorted(set(citations))
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except Exception as e:
        print(f"Error while reading the file: {e}")
    return citations
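
# Example: for a file containing "One claim[3] and another[1], repeated[3]",
# extract_citations returns [1, 3].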
|
|
def load_map(path):
    """Load the citation map and invert it from URL -> index to index -> URL."""
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return {index: url for url, index in data['url_to_unified_index'].items()}
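
# Assumed (hypothetical) shape of url_to_info.json, based on the key read above:
#   {"url_to_unified_index": {"https://example.com/a": 1, "https://example.com/b": 2}}
# load_map inverts this to {1: "https://example.com/a", 2: "https://example.com/b"}.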
|
|
def remove_lines_after_marker(content: str, marker: str = '---') -> str:
    """
    Remove every line that equals the marker, together with the line
    immediately following it.
    """
    lines = content.splitlines()
    updated_lines = []
    skip_next_line = False

    for line in lines:
        if skip_next_line:
            skip_next_line = False
            continue
        if line.strip() == marker:
            # Drop the marker line itself and flag the next line for removal.
            skip_next_line = True
        else:
            updated_lines.append(line)

    return '\n'.join(updated_lines)
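
# Example: remove_lines_after_marker("Title\n---\nStray line\nBody")
# returns "Title\nBody" -- both the '---' line and the line after it are removed.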
|
|
def add_ref(citations, mymap, content):
    """Append a numbered reference list, mapping each citation to its URL."""
    references = "\n\nReferences:\n"
    for citation in citations:
        if citation in mymap:
            references += f"[{citation}] {mymap[citation]}\n"
        else:
            references += f"[{citation}] No corresponding URL found\n"
    content += references
    return content
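
# Example: add_ref([1, 3], {1: "https://example.com/a"}, "Body text")
# returns the body followed by:
#   References:
#   [1] https://example.com/a
#   [3] No corresponding URL found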
|
|
def polish(article_path, map_path):
    """Run the full post-processing pipeline on a generated article, in place."""
    citations = extract_citations(article_path)
    mymap = load_map(map_path)

    with open(article_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Append references, strip '---' separators, and de-duplicate citations.
    content = add_ref(citations, mymap, content)
    content = remove_lines_after_marker(content, marker='---')
    content = remove_consecutive_duplicate_citations(content)

    with open(article_path, 'w', encoding='utf-8') as file:
        file.write(content)
    print("References successfully appended to the end of the article.")
|
|
def remove_consecutive_duplicate_citations(content: str) -> str:
    """
    Within each line, collapse consecutive occurrences of the same citation
    (with no different citation in between) into one, keeping only the last
    occurrence.
    """
    lines = content.splitlines()
    processed_lines = []

    for line in lines:
        # Split so that citation tokens like [3] become standalone parts.
        parts = re.split(r'(\[\d+\])', line)

        new_parts = []
        last_citation = None
        last_citation_index = -1

        for part in parts:
            if re.match(r'\[\d+\]', part):
                if part == last_citation:
                    # Same citation seen again: drop the earlier occurrence.
                    new_parts.pop(last_citation_index)
                last_citation = part
                last_citation_index = len(new_parts)
            new_parts.append(part)

        processed_lines.append(''.join(new_parts))

    return '\n'.join(processed_lines)
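
# Example: remove_consecutive_duplicate_citations("fact[1] more[1] other[2]")
# returns "fact more[1] other[2]" -- the earlier [1] is dropped, the later one kept.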
|
|
def post_polish(path):
    """Polish the generated article inside a STORM output directory."""
    article_path = path + '/storm_gen_article_polished.txt'
    map_path = path + '/url_to_info.json'
    polish(article_path, map_path)
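
# Typical usage, assuming a STORM output directory containing the two files
# named above ('results/my_topic' is a hypothetical path):
#   post_polish('results/my_topic')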
|
|
if __name__ == '__main__':
    # Quick self-test of the duplicate-citation removal on a synthetic string.
    string = "Fact A.[1]Fact B.[2]Fact C.[1]Fact D.[1]Fact E.[1]Fact E.[1]Fact E.[1]Fact B.[2]"
    post_string = remove_consecutive_duplicate_citations(string)
    print(post_string)
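    # Expected output:
    #   Fact A.[1]Fact B.[2]Fact C.Fact D.Fact E.Fact E.Fact E.[1]Fact B.[2]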
|
|