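"""GrepVault: search GitHub code via the grep.app API and export matching
lines (e.g. exposed API keys) to CSV through a Gradio interface."""
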
import gradio as gr
import requests
import bs4
import re
import uuid
import csv
import time
import pandas as pd

class Hits:
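    """Accumulates search hits as {repo: {path: {line_num: line_text}}}."""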
    def __init__(self):
        self.mark_start_placeholder = str(uuid.uuid4())
        self.mark_end_placeholder = str(uuid.uuid4())
        self.hits = {}

    def _parse_snippet(self, snippet):
        matches = {}
        soup = bs4.BeautifulSoup(snippet, 'lxml')
        for tr in soup.select('tr'):
            line_num = tr.select_one("div.lineno").text.strip()
            line = tr.select_one("pre").decode_contents()
            # Only lines containing a <mark> tag are actual matches.
            if "<mark" not in line:
                continue
            # Swap the <mark> tags for unique placeholders so the match text
            # survives the HTML-stripping pass, then drop the placeholders so
            # they don't leak into the exported content.
            line = re.sub(r'<mark[^>]*>', self.mark_start_placeholder, line)
            line = line.replace("</mark>", self.mark_end_placeholder)
            line = bs4.BeautifulSoup(line, 'lxml').text
            line = line.replace(self.mark_start_placeholder, "")
            line = line.replace(self.mark_end_placeholder, "")
            matches[line_num] = line
        return matches

    def add_hit(self, repo, path, snippet):
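        """Record every matching line from an HTML snippet under repo/path."""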
        if repo not in self.hits:
            self.hits[repo] = {}
        if path not in self.hits[repo]:
            self.hits[repo][path] = {}
        for line_num, line in self._parse_snippet(snippet).items():
            self.hits[repo][path][line_num] = line

    def merge(self, hits2):
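        """Merge another Hits instance into this one."""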
        for hit_repo, path_data in hits2.hits.items():
            if hit_repo not in self.hits:
                self.hits[hit_repo] = {}
            for path, lines in path_data.items():
                if path not in self.hits[hit_repo]:
                    self.hits[hit_repo][path] = {}
                for line_num, line in lines.items():
                    self.hits[hit_repo][path][line_num] = line

def fetch_grep_app(page, query, use_regex, whole_words, case_sensitive, repo_filter, path_filter):
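    """Fetch one page of results from the grep.app search API.

    Returns (next_page, hits, total_count); next_page is None once the last
    page has been reached or the request failed.
    """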
    params = {
        'q': query,
        'page': page
    }
    url = "https://grep.app/api/search"

    # Regex search takes precedence over whole-word matching.
    if use_regex:
        params['regexp'] = 'true'
    elif whole_words:
        params['words'] = 'true'

    if case_sensitive:
        params['case'] = 'true'
    if repo_filter:
        params['f.repo.pattern'] = repo_filter
    if path_filter:
        params['f.path.pattern'] = path_filter
    response = requests.get(url, params=params, timeout=30)
    if response.status_code != 200:
        # Treat any non-200 response as "no results for this page".
        return None, None, 0
    data = response.json()
    count = data['facets']['count']
    hits = Hits()
    for hit_data in data['hits']['hits']:
        repo = hit_data['repo']['raw']
        path = hit_data['path']['raw']
        snippet = hit_data['content']['snippet']
        hits.add_hit(repo, path, snippet)

    # The API serves 10 hits per page; continue only while more pages remain.
    if count > 10 * page:
        return page + 1, hits, count
    return None, hits, count

def extract_query_content(line, query):
    """Extract everything after `query` up to the first '=', backtick, or double quote.

    For example (illustrative values), with line='url = "...:generateContent?key=AIzaXYZ"'
    and query=':generateContent?key=', this returns 'AIzaXYZ'.
    """
    # Find the query string in the line.
    query_index = line.find(query)
    if query_index == -1:
        return ""  # Query not found in this line.

    # Take everything after the query string.
    remaining_line = line[query_index + len(query):]

    # Capture characters up to the first '=', backtick, or double quote.
    match = re.search(r'^([^=`"]+)', remaining_line)
    if match:
        return match.group(1).strip()
    return ""

def search_and_export(query, use_regex, whole_words, case_sensitive, repo_filter, path_filter, progress=gr.Progress()):
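    """Run a paged search, export every hit to CSV, and build the UI outputs."""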
    hits = Hits()
    next_page = 1
    total_results = 0
    # Walk through up to 100 result pages, pausing between requests.
    while next_page and next_page < 101:
        progress(next_page / 100, desc="Fetching data...")
        next_page, page_hits, count = fetch_grep_app(next_page, query, use_regex, whole_words, case_sensitive, repo_filter, path_filter)
        if page_hits:
            hits.merge(page_hits)
            # `count` is the total match count reported by the API, not a
            # per-page figure, so assign rather than accumulate.
            total_results = count
        if next_page is None:
            break
        time.sleep(1)

    # Export to CSV
    csv_filename = "search_results.csv"
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Repository", "Line Number", "Extracted Query", "Content", "Path"])  # Path moved to the end
        for repo, paths in hits.hits.items():
            for path, lines in paths.items():
                for line_num, content in lines.items():
                    extracted_query = extract_query_content(content, query)
                    writer.writerow([repo, line_num, extracted_query, content, path])

    # Read the CSV back into a DataFrame for display.
    df = pd.read_csv(csv_filename)
    # Deduplicate on `Extracted Query`, keeping the first row for each value.
    df_unique = df.drop_duplicates(subset=["Extracted Query"])
    # Limit the preview to the first 6 unique results.
    df_top6_unique = df_unique.head(6)
    return csv_filename, df_top6_unique, f"**Total Results: {total_results}**", f"Displaying top 6 unique results. Download the CSV for all {total_results} results."

# Custom CSS for a modern look
custom_css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
    background: linear-gradient(135deg, #f5f7fa, #c3cfe2);
    padding: 20px;
    border-radius: 10px;
}
.gradio-header {
    text-align: center;
    font-size: 24px;
    font-weight: bold;
    color: #333;
}
.gradio-header h1,
.gradio-header p {
    text-decoration: underline;
}
.gradio-inputs {
    background: white;
    padding: 20px;
    border-radius: 10px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.gradio-button {
    background: #4CAF50 !important;
    color: white !important;
    border-radius: 5px !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
}
.gradio-button:hover {
    background: #45a049 !important;
}
.gradio-outputs {
    background: white;
    padding: 20px;
    border-radius: 10px;
    margin-top: 20px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.gradio-dataframe {
    max-height: 400px;
    overflow-y: auto;
    border: 1px solid #ddd;
    border-radius: 5px;
    padding: 10px;
}
.gradio-results-count {
    font-size: 18px;
    font-weight: bold;
    color: #000;
    margin-bottom: 10px;
}
.gradio-download-message {
    font-size: 16px;
    color: #333;
    margin-top: 10px;
}
"""

# UI using Gradio Blocks
with gr.Blocks(css=custom_css, theme="default") as demo:
    gr.Markdown("""
    <div class="gradio-header">
        <h1>GrepVault: Search GitHub for API Keys</h1>
        <p><a href="https://github.com/SanshruthR/GrepVault">https://github.com/SanshruthR/GrepVault</a></p>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=2, elem_classes="gradio-inputs"):
            query = gr.Textbox(label="Search Query", placeholder="Enter your search query, e.g. :generateContent?key=", lines=1)
            use_regex = gr.Checkbox(label="Use Regular Expression", value=False)
            whole_words = gr.Checkbox(label="Match Whole Words", value=False)
            case_sensitive = gr.Checkbox(label="Case Sensitive Search", value=False)
            repo_filter = gr.Textbox(label="Repository Filter", placeholder="e.g., user/repo", lines=1)
            path_filter = gr.Textbox(label="Path Filter", placeholder="e.g., src/", lines=1)
            search_button = gr.Button("Search and Export", elem_classes="gradio-button")

        with gr.Column(scale=3, elem_classes="gradio-outputs"):
            results_count = gr.Markdown("**Total Results: 0**", elem_classes="gradio-results-count")
            csv_download = gr.File(label="Download CSV")
            csv_preview = gr.Dataframe(label="CSV Preview (Top 6 Unique Results)", headers=["Repository", "Line Number", "Extracted Query", "Content", "Path"], elem_classes="gradio-dataframe")
            download_message = gr.Markdown("Displaying top 6 unique results. Download the CSV for all results.", elem_classes="gradio-download-message")

    search_button.click(
        search_and_export,
        inputs=[query, use_regex, whole_words, case_sensitive, repo_filter, path_filter],
        outputs=[csv_download, csv_preview, results_count, download_message]
    )

if __name__ == "__main__":
    demo.launch()