import re
import json

def extract_citations(file_path):
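    """Collect the citation numbers cited in a text file.

    Matches are deduplicated and sorted, so text like 'see [3] and [3][7]'
    yields [3, 7]. Returns an empty list if the file cannot be read.
    """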
    # Citation numbers found in the article
    citations = []
    
    try:
        # Open the file and read its contents
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            
            # Use a regex to find citation numbers, assuming citations look like
            # [n] (possibly chained, e.g. [1][2]); \d+ matches one or more digits
            citation_matches = re.findall(r'\[\d+\]', content)
            
            # Extract the numeric part from each '[n]'-style match
            citations = [int(match.strip('[]')) for match in citation_matches]
            
            # Deduplicate and sort the citation numbers
            citations = sorted(set(citations))
            
    except FileNotFoundError:
        print(f"文件 {file_path} 未找到。")
    except Exception as e:
        print(f"读取文件时发生错误:{e}")
    
    return citations

def load_map(path):
    """Load url_to_info.json and return a mapping from unified citation index to URL.

    The file is expected to contain a 'url_to_unified_index' object mapping each
    source URL to its citation index; this function inverts that mapping.
    """
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    # Invert url -> index into index -> url so citations can be resolved by number
    return {index: url for url, index in data['url_to_unified_index'].items()}

def remove_lines_after_marker(content: str, marker: str = '---') -> str:
    """
    Remove the line immediately following the marker in the content.
    """
    lines = content.splitlines()
    updated_lines = []
    skip_next_line = False
    
    for line in lines:
        if skip_next_line:
            skip_next_line = False
            continue
        if line.strip() == marker:
            skip_next_line = True
        else:
            updated_lines.append(line)
    
    return '\n'.join(updated_lines)

def add_ref(citations, mymap, content):
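    """Append a reference list (one '[index] URL' line per citation) to the article."""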
    references = "\n\n参考文献:\n"
    for citation in citations:
        if citation in mymap:
            references += f"[{citation}] {mymap[citation]}\n"
        else:
            references += f"[{citation}] 未找到对应的URL\n"
    
    # Append the reference list to the end of the article
    content += references
    return content

def polish(article_path, map_path):
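    """Post-process one generated article in place: append its reference list,
    remove '---' marker lines (and the line after each), and collapse
    consecutive duplicate citations within each line.
    """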
    # Extract the citation numbers used in the article
    citations = extract_citations(article_path)
    # Load the index-to-URL mapping
    mymap = load_map(map_path)

    with open(article_path, 'r', encoding='utf-8') as file:
        content = file.read()
  
    content = add_ref(citations, mymap, content)
    content = remove_lines_after_marker(content, marker='---')
    content = remove_consecutive_duplicate_citations(content)
        
    with open(article_path, 'w', encoding='utf-8') as file:
        file.write(content)    
        print("参考文献已成功附加到文章末尾。")
    

def remove_consecutive_duplicate_citations(content: str) -> str:
    """
    This function removes consecutive duplicate citations in the text within the same line,
    deleting earlier instances and keeping only the last one when duplicates are adjacent.
    """
    # Split the content into lines
    lines = content.splitlines()
    
    processed_lines = []
    
    for line in lines:
        # Find all citations and split them from the text
        parts = re.split(r'(\[\d+\])', line)
        
        # List to hold the new parts after removing duplicates
        new_parts = []
        last_citation = None
        last_citation_index = -1
        
        for index, part in enumerate(parts):
            if re.match(r'\[\d+\]', part):
                if part == last_citation:
                    # Same citation as the previous one: drop the earlier occurrence
                    new_parts.pop(last_citation_index)
                last_citation = part
                last_citation_index = len(new_parts)
            new_parts.append(part)
        
        # Reconstruct the line, dropping empty parts produced by re.split
        new_line = ''.join([p for p in new_parts if p != ''])
        processed_lines.append(new_line)
    
    # Join the processed lines back into a single string
    return '\n'.join(processed_lines)


# Set up file paths for one result directory and run the polish step
def post_polish(path):
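    """Polish the generated article inside a single result directory."""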
    article_path = path + '/storm_gen_article_polished.txt'
    map_path = path + '/url_to_info.json' 
    polish(article_path, map_path)
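

# Hedged sketch (not part of the original script): a small batch driver built on
# post_polish. It assumes the layout hinted at by the example path in __main__,
# i.e. one subdirectory per topic under a results root, each containing
# storm_gen_article_polished.txt and url_to_info.json; adjust as needed.
def post_polish_all(results_root):
    import os
    for name in sorted(os.listdir(results_root)):
        topic_dir = os.path.join(results_root, name)
        # Only touch directories that look like finished runs
        if os.path.isfile(os.path.join(topic_dir, 'storm_gen_article_polished.txt')):
            post_polish(topic_dir)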
  

if __name__ == '__main__':
    # path = '/mnt/nas-alinlp/xizekun/project/storm/results/gpt/台风玛莉亚'
    # article_path = path + '/storm_gen_article_polished.txt'
    # map_path = path + '/url_to_info.json'
    # # Append the reference list to the end of the article
    # polish(article_path, map_path)


    string = "你好[1]你好[2]你好啊[1]你非常好[1]你非常的好[1]你非常的好[1]你非常的好[1]你好[2]"
    post_string = remove_consecutive_duplicate_citations(string)
    print(post_string)
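    # Expected output: 你好[1]你好[2]你好啊你非常好你非常的好你非常的好你非常的好[1]你好[2]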