import re
import json


def extract_citations(file_path):
    """Extract all citation numbers (patterns like "[12]") from a text file.

    Returns a sorted list of unique citation numbers. On a missing or
    unreadable file the error is reported to stdout and an empty list is
    returned (best-effort by design).
    """
    citations = []
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # Citations are assumed to look like "[number]".
        citation_matches = re.findall(r'\[\d+\]', content)
        # Strip the brackets, de-duplicate, and sort ascending.
        citations = sorted({int(match.strip('[]')) for match in citation_matches})
    except FileNotFoundError:
        print(f"文件 {file_path} 未找到。")
    except Exception as e:
        # Best-effort: report and return whatever was collected so far.
        print(f"读取文件时发生错误:{e}")
    return citations


def load_map(path):
    """Load the JSON file at *path* and invert its 'url_to_unified_index'
    mapping, returning a dict of {unified_index: url}."""
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # Invert url -> index into index -> url in a single pass.
    return {index: url for url, index in data['url_to_unified_index'].items()}


def remove_lines_after_marker(content: str, marker: str = '---') -> str:
    """Remove each marker line AND the line immediately following it.

    A line counts as a marker when it equals *marker* after stripping
    surrounding whitespace.
    """
    updated_lines = []
    skip_next_line = False
    for line in content.splitlines():
        if skip_next_line:
            # This is the line right after a marker: drop it.
            skip_next_line = False
            continue
        if line.strip() == marker:
            # Drop the marker line itself and flag the next one.
            skip_next_line = True
        else:
            updated_lines.append(line)
    return '\n'.join(updated_lines)


def add_ref(citations, mymap, content):
    """Append a reference section listing each citation's URL to *content*.

    Citations missing from *mymap* get a "not found" placeholder line.
    Returns the augmented content string.
    """
    references = "\n\n参考文献:\n"
    for citation in citations:
        if citation in mymap:
            references += f"[{citation}] {mymap[citation]}\n"
        else:
            references += f"[{citation}] 未找到对应的URL\n"
    # Attach the reference list to the end of the article.
    return content + references


def polish(article_path, map_path):
    """Post-process the article at *article_path* in place.

    Steps: collect its citation numbers, append a reference section mapped
    via *map_path*, strip '---' marker lines (and their following line),
    and collapse consecutive duplicate citations. The file is rewritten.
    """
    # Citation numbers referenced in the article.
    citations = extract_citations(article_path)
    # Mapping from unified index to source URL.
    mymap = load_map(map_path)
    with open(article_path, 'r', encoding='utf-8') as file:
        content = file.read()
    content = add_ref(citations, mymap, content)
    content = remove_lines_after_marker(content, marker='---')
    content = remove_consecutive_duplicate_citations(content)
    with open(article_path, 'w', encoding='utf-8') as file:
        file.write(content)
    print("参考文献已成功附加到文章末尾。")


def remove_consecutive_duplicate_citations(content: str) -> str:
    """Collapse repeated citations within each line.

    When the same citation appears again with no *other* citation in
    between (intervening plain text is allowed), the earlier occurrence is
    deleted and only the last one is kept.
    """
    processed_lines = []
    for line in content.splitlines():
        # Split so citation tokens like "[3]" are kept as separate parts.
        parts = re.split(r'(\[\d+\])', line)
        new_parts = []
        last_citation = None
        last_citation_index = -1
        for part in parts:
            if re.match(r'\[\d+\]', part):
                if part == last_citation:
                    # Same citation seen again: drop the earlier instance.
                    new_parts.pop(last_citation_index)
                last_citation = part
                last_citation_index = len(new_parts)
            new_parts.append(part)
        # Rejoin, dropping empty fragments left over from the split.
        processed_lines.append(''.join(p for p in new_parts if p != ''))
    return '\n'.join(processed_lines)


def post_polish(path):
    """Run `polish` on the standard output files located under *path*."""
    article_path = path + '/storm_gen_article_polished.txt'
    map_path = path + '/url_to_info.json'
    polish(article_path, map_path)


if __name__ == '__main__':
    # path = '/mnt/nas-alinlp/xizekun/project/storm/results/gpt/台风玛莉亚'
    # article_path = path + '/storm_gen_article_polished.txt'
    # map_path = path + '/url_to_info.json'
    # # 调用函数将参考文献添加到文章末尾
    # append_references_to_article(article_path, map_path)
    string = "你好[1]你好[2]你好啊[1]你非常好[1]你非常的好[1]你非常的好[1]你非常的好[1]你好[2]"
    post_string = remove_consecutive_duplicate_citations(string)
    print(post_string)