import re
import json

def extract_citations(file_path):
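    """Collect the citation numbers cited in a text file.

    Matches are deduplicated and sorted, so text like 'see [3] and [3][7]'
    yields [3, 7]. Returns an empty list if the file cannot be read.
    """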
    # Citation numbers found in the article
    citations = []
    
    try:
        # Open the file and read its contents
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            
            # Use a regex to find citation numbers, assuming citations look like
            # [n] (possibly chained, e.g. [1][2]); \d+ matches one or more digits
            citation_matches = re.findall(r'\[\d+\]', content)
            
            # Extract the numeric part from each '[n]'-style match
            citations = [int(match.strip('[]')) for match in citation_matches]
            
            # Deduplicate and sort the citation numbers
            citations = sorted(set(citations))
            
    except FileNotFoundError:
        print(f"文件 {file_path} 未找到。")
    except Exception as e:
        print(f"读取文件时发生错误:{e}")
    
    return citations

def load_map(path):
    """Load url_to_info.json and return a mapping from unified citation index to URL.

    The file is expected to contain a 'url_to_unified_index' object mapping each
    source URL to its citation index; this function inverts that mapping.
    """
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # Invert url -> index into index -> url so citations can be resolved by number
    return {index: url for url, index in data['url_to_unified_index'].items()}

def remove_lines_after_marker(content: str, marker: str = '---') -> str:
    """
    Remove the line immediately following the marker in the content.
    """
    lines = content.splitlines()
    updated_lines = []
    skip_next_line = False
    
    for line in lines:
        if skip_next_line:
            skip_next_line = False
            continue
        if line.strip() == marker:
            skip_next_line = True
        else:
            updated_lines.append(line)
    
    return '\n'.join(updated_lines)

def add_ref(citations, mymap, content):
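    """Append a reference list (one '[index] URL' line per citation) to the article."""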
    references = "\n\n参考文献:\n"
    for citation in citations:
        if citation in mymap:
            references += f"[{citation}] {mymap[citation]}\n"
        else:
            references += f"[{citation}] 未找到对应的URL\n"
    
    # Append the reference list to the end of the article
    content += references
    return content

def polish(article_path, map_path):
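    """Post-process one generated article in place: append its reference list,
    remove '---' marker lines (and the line after each), and collapse
    consecutive duplicate citations within each line.
    """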
    # Extract the citation numbers used in the article
    citations = extract_citations(article_path)
    # Load the index-to-URL mapping
    mymap = load_map(map_path)

    with open(article_path, 'r', encoding='utf-8') as file:
        content = file.read()
  
    content = add_ref(citations, mymap, content)
    content = remove_lines_after_marker(content, marker='---')
    content = remove_consecutive_duplicate_citations(content)
        
    with open(article_path, 'w', encoding='utf-8') as file:
        file.write(content)    
        print("参考文献已成功附加到文章末尾。")
    

def remove_consecutive_duplicate_citations(content: str) -> str:
    """
    This function removes consecutive duplicate citations in the text within the same line,
    deleting earlier instances and keeping only the last one when duplicates are adjacent.
    """
    # Split the content into lines
    lines = content.splitlines()
    
    processed_lines = []
    
    for line in lines:
        # Find all citations and split them from the text
        parts = re.split(r'(\[\d+\])', line)
        
        # List to hold the new parts after removing duplicates
        new_parts = []
        last_citation = None
        last_citation_index = -1
        
        for index, part in enumerate(parts):
            if re.match(r'\[\d+\]', part):
                if part == last_citation:
                    # Same citation as the previous one: drop the earlier occurrence
                    new_parts.pop(last_citation_index)
                last_citation = part
                last_citation_index = len(new_parts)
            new_parts.append(part)
        
        # Reconstruct the line, dropping empty parts produced by re.split
        new_line = ''.join([p for p in new_parts if p != ''])
        processed_lines.append(new_line)
    
    # Join the processed lines back into a single string
    return '\n'.join(processed_lines)


# Set up file paths for one result directory and run the polish step
def post_polish(path):
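    """Polish the generated article inside a single result directory."""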
    article_path = path + '/storm_gen_article_polished.txt'
    map_path = path + '/url_to_info.json' 
    polish(article_path, map_path)
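

# Hedged sketch (not part of the original script): a small batch driver built on
# post_polish. It assumes the layout hinted at by the example path in __main__,
# i.e. one subdirectory per topic under a results root, each containing
# storm_gen_article_polished.txt and url_to_info.json; adjust as needed.
def post_polish_all(results_root):
    import os
    for name in sorted(os.listdir(results_root)):
        topic_dir = os.path.join(results_root, name)
        # Only touch directories that look like finished runs
        if os.path.isfile(os.path.join(topic_dir, 'storm_gen_article_polished.txt')):
            post_polish(topic_dir)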
  

if __name__ == '__main__':
    # path = '/mnt/nas-alinlp/xizekun/project/storm/results/gpt/台风玛莉亚'
    # article_path = path + '/storm_gen_article_polished.txt'
    # map_path = path + '/url_to_info.json'
    # # Append the reference list to the end of the article
    # polish(article_path, map_path)


    string = "你好[1]你好[2]你好啊[1]你非常好[1]你非常的好[1]你非常的好[1]你非常的好[1]你好[2]"
    post_string = remove_consecutive_duplicate_citations(string)
    print(post_string)
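    # Expected output: 你好[1]你好[2]你好啊你非常好你非常的好你非常的好你非常的好[1]你好[2]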