|
import streamlit as st |
|
import javalang |
|
import torch |
|
import torch.nn.functional as F |
|
import re |
|
from transformers import AutoTokenizer, AutoModel |
|
import warnings |
|
import pandas as pd |
|
import zipfile |
|
import os |
|
|
|
|
|
# Configure the browser tab (title, favicon) and use the wide page layout.
# NOTE(review): the icon glyph was mis-encoded in the source; a magnifying
# glass is assumed here -- confirm the intended emoji.
st.set_page_config(
    page_title="Java Code Clone Detector (IJaDataset 2.1)",
    page_icon="🔍",
    layout="wide",
)
|
|
|
|
|
# Ignore all Python warnings from this point on.
warnings.filterwarnings("ignore")

# Hugging Face identifier handed to the tokenizer/model loaders.
MODEL_NAME = "microsoft/codebert-base"

# Token limit passed to the tokenizer (longer inputs are truncated).
MAX_LENGTH = 512

# Archive that load_dataset() extracts when the dataset folder is missing.
DATASET_PATH = "archive (1).zip"

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
|
|
|
|
|
@st.cache_resource
def load_models():
    """Load the tokenizer and encoder named by ``MODEL_NAME``.

    The function is wrapped in ``st.cache_resource``, so whatever it returns
    (including the failure result) is cached and reused.

    Returns:
        ``(tokenizer, model)`` with the model moved to ``DEVICE``, or
        ``(None, None)`` after the failure has been reported via ``st.error``.
    """
    try:
        codebert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        codebert = AutoModel.from_pretrained(MODEL_NAME)
        codebert = codebert.to(DEVICE)
    except Exception as e:
        st.error(f"Failed to load models: {str(e)}")
        return None, None
    return codebert_tokenizer, codebert
|
|
|
@st.cache_resource
def load_dataset():
    """Collect example clone pairs from the extracted dataset folders.

    If ``Subject_CloneTypes_Directories`` does not exist yet, the archive at
    ``DATASET_PATH`` is extracted into the current directory first. Each
    clone-type folder is then walked in sorted order, and the first directory
    that holds at least two files supplies one pair: its first two files by
    name.

    Returns:
        Up to 10 dicts with the keys ``"type"``, ``"code1"`` and ``"code2"``.
        An empty list is returned (after ``st.error``) if anything fails.
    """
    base_path = "Subject_CloneTypes_Directories"
    try:
        if not os.path.exists(base_path):
            with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
                zip_ref.extractall(".")

        clone_pairs = []
        for clone_type in ("Clone_Type1", "Clone_Type2", "Clone_Type3 - ST"):
            type_path = os.path.join(base_path, clone_type)
            if not os.path.exists(type_path):
                continue

            for root, dirs, files in os.walk(type_path):
                # os.walk lists entries in arbitrary order. Sorting the
                # sub-directories in place fixes the traversal order, and
                # sorting the files fixes which two are picked, so the same
                # example pair is chosen on every machine and run.
                dirs.sort()
                files.sort()
                if len(files) < 2:
                    continue

                with open(os.path.join(root, files[0]), 'r', encoding='utf-8') as f1:
                    code1 = f1.read()
                with open(os.path.join(root, files[1]), 'r', encoding='utf-8') as f2:
                    code2 = f2.read()

                clone_pairs.append({
                    "type": clone_type,
                    "code1": code1,
                    "code2": code2,
                })
                # One example pair per clone type is enough.
                break

        return clone_pairs[:10]
    except Exception as e:
        st.error(f"Error loading dataset: {str(e)}")
        return []
|
|
|
# Fetch the (cached) tokenizer/model pair used by get_embedding().
tokenizer, code_model = load_models()
if tokenizer is None or code_model is None:
    # load_models() has already shown the error. Nothing below can work
    # without a model, so stop this script run instead of rendering a UI
    # whose every comparison would fail.
    st.stop()

# Fetch the (cached) example pairs offered in the selector below.
dataset_pairs = load_dataset()
|
|
|
# One left-to-right scan over Java string/char literals and both comment
# forms. Literals are matched only so that comment markers inside them are
# skipped instead of being mistaken for the start of a comment.
_JAVA_LITERAL_OR_COMMENT = re.compile(
    r'"(?:\\.|[^"\\\n])*"'      # string literal
    r"|'(?:\\.|[^'\\\n])*'"     # char literal
    r"|//[^\n]*"                # line comment (up to end of line)
    r"|/\*.*?\*/",              # block comment (may span lines)
    re.DOTALL,
)


def normalize_code(code):
    """Strip Java comments from ``code`` and collapse its whitespace.

    Line comments (``// ...``) and block comments (``/* ... */``) are
    removed. Comment markers that appear inside string or character literals
    are left alone, so a literal such as ``"http://host"`` is kept intact.
    Every run of whitespace is then replaced by a single space and the result
    is stripped.

    Args:
        code: Source text to normalize.

    Returns:
        The normalized text. A value that is not a ``str`` is returned
        unchanged.
    """
    if not isinstance(code, str):
        return code

    def _drop_comment(match):
        token = match.group(0)
        # Literals start with a quote and are kept; anything else matched by
        # the pattern is a comment and is removed.
        return token if token[0] in "\"'" else ""

    without_comments = _JAVA_LITERAL_OR_COMMENT.sub(_drop_comment, code)
    return re.sub(r"\s+", " ", without_comments).strip()
|
|
|
def get_embedding(code):
    """Embed a code snippet as a single mean-pooled vector.

    The snippet is normalized with ``normalize_code``, tokenized (truncated
    to ``MAX_LENGTH`` tokens) and run through ``code_model`` without gradient
    tracking. The token states are averaged with the attention mask as
    weights, so only real tokens contribute to the result.

    Args:
        code: Source text of the snippet.

    Returns:
        The pooled embedding tensor (one row per input sequence), or ``None``
        after the failure has been reported via ``st.error``.
    """
    try:
        # No padding is requested: a single sequence needs none, and the
        # mask-weighted mean below ignores pad positions in any case.
        inputs = tokenizer(
            normalize_code(code),
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
        ).to(DEVICE)

        with torch.no_grad():
            outputs = code_model(**inputs)

        hidden = outputs.last_hidden_state
        # Broadcast the mask over the hidden dimension so masked positions
        # add nothing to the sum.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        # clamp guards against a division by zero for an all-masked row.
        token_count = mask.sum(dim=1).clamp(min=1)
        return (hidden * mask).sum(dim=1) / token_count
    except Exception as e:
        st.error(f"Error processing code: {str(e)}")
        return None
|
|
|
def compare_code(code1, code2):
    """Return the cosine similarity between the embeddings of two snippets.

    Args:
        code1: First snippet.
        code2: Second snippet.

    Returns:
        The similarity as a float limited to ``[-1.0, 1.0]``, or ``None``
        when either snippet is empty or could not be embedded.
    """
    if not code1 or not code2:
        return None

    with st.spinner('Analyzing code...'):
        emb1 = get_embedding(code1)
        emb2 = get_embedding(code2)

    if emb1 is None or emb2 is None:
        return None

    with torch.no_grad():
        similarity = F.cosine_similarity(emb1, emb2).item()

    # Float rounding can push the cosine of (near-)identical vectors a hair
    # outside its mathematical range. Explicit comparisons are used instead
    # of min()/max() so that a NaN is passed through rather than turned into
    # a valid-looking score.
    if similarity > 1.0:
        return 1.0
    if similarity < -1.0:
        return -1.0
    return similarity
|
|
|
|
|
# Page header and one-line description.
# NOTE(review): the leading glyph of the title was mis-encoded in the source;
# a magnifying glass is assumed here -- confirm the intended emoji.
st.title("🔍 Java Code Clone Detector (IJaDataset 2.1)")
st.markdown("Compare Java code snippets from the IJaDataset 2.1 using CodeBERT embeddings.")
|
|
|
|
|
# Offer the preloaded pairs, if any, as numbered entries in a dropdown.
# selected_pair stays None when no pairs were loaded.
selected_pair = None
if dataset_pairs:
    pair_options = {
        f"{number}: {entry['type']}": entry
        for number, entry in enumerate(dataset_pairs, start=1)
    }
    selected_option = st.selectbox(
        "Select a preloaded example pair:",
        list(pair_options),
    )
    selected_pair = pair_options[selected_option]
|
|
|
|
|
# Two side-by-side editors, pre-filled from the selected example pair when
# there is one and empty otherwise.
col1, col2 = st.columns(2)

code1 = col1.text_area(
    "First Java Code",
    value=selected_pair["code1"] if selected_pair else "",
    height=300,
    help="Enter the first Java code snippet",
)

code2 = col2.text_area(
    "Second Java Code",
    value=selected_pair["code2"] if selected_pair else "",
    height=300,
    help="Enter the second Java code snippet",
)

# Similarity score at or above which the pair is reported as a clone.
threshold = st.slider(
    "Clone Detection Threshold",
    min_value=0.50,
    max_value=1.00,
    step=0.01,
    value=0.75,
    help="Similarity score needed to consider code as cloned (0.5-1.0)",
)
|
|
|
|
|
# Run the comparison on demand and render the verdict.
if st.button("Compare Code"):
    if not code1 or not code2:
        # compare_code() returns None for empty input without any message;
        # tell the user why nothing is shown.
        st.warning("Please enter both Java code snippets before comparing.")

    similarity = compare_code(code1, code2)

    if similarity is not None:
        is_clone = similarity >= threshold

        st.subheader("Results")
        cols = st.columns(3)
        cols[0].metric("Similarity Score", f"{similarity:.3f}")
        cols[1].metric("Current Threshold", f"{threshold:.3f}")
        cols[2].metric(
            "Verdict",
            "✅ CLONE" if is_clone else "❌ NOT CLONE",
            delta=f"{similarity-threshold:+.3f}",
            help=f"Score {'≥' if is_clone else '<'} threshold",
        )

        # st.progress rejects floats outside 0.0-1.0, while a cosine score
        # can be negative or exceed 1.0 by a rounding error, so clamp it.
        st.progress(min(max(similarity, 0.0), 1.0))

        with st.expander("Interpretation Guide"):
            st.markdown("""
            - **> 0.95**: Nearly identical (Type-1 clone)
            - **0.85-0.95**: Very similar (Type-2 clone)
            - **0.70-0.85**: Similar structure (Type-3 clone)
            - **< 0.70**: Different code
            """)

        with st.expander("Show normalized code"):
            tab1, tab2 = st.tabs(["First Code", "Second Code"])
            with tab1:
                st.code(normalize_code(code1))
            with tab2:
                st.code(normalize_code(code2))
|
|
|
# Footer: a horizontal rule followed by a short note about the dataset.
st.markdown("---")
st.markdown(
    """
**Dataset Information**:
- Using IJaDataset 2.1 from Kaggle
- Contains 100K Java files with clone annotations
- Clone types: Type-1, Type-2, and Type-3 clones
"""
)