File size: 5,520 Bytes
e39d081 20e20dc e39d081 20e20dc e39d081 20e20dc e39d081 20e20dc e39d081 75f22f0 e39d081 75f22f0 e39d081 20e20dc e39d081 20e20dc e39d081 20e20dc e39d081 20e20dc e39d081 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 |
import streamlit as st
import javalang
import torch
import torch.nn.functional as F
import re
from transformers import AutoTokenizer, AutoModel
import warnings
# Page configuration. This is issued before any other Streamlit call in the
# script.
# NOTE(review): the "π" icon looks like a mis-decoded emoji — confirm the
# intended character against the original file.
_PAGE_SETTINGS = {
    "page_title": "Java Code Clone Detector",
    "page_icon": "π",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# Silence every warning category for the rest of the process.
warnings.filterwarnings(action="ignore")

# Model / runtime constants shared by the functions below.
MODEL_NAME = "microsoft/codebert-base"
MAX_LENGTH = 512  # token budget passed to the tokenizer
_DEVICE_NAME = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_DEVICE_NAME)
# Initialize models with caching
@st.cache_resource
def _load_models_cached():
    """Load the tokenizer and encoder once and cache them.

    Any exception is allowed to propagate so that a failed attempt is not
    stored as the cached result; only a successful load is cached.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
    return tokenizer, model


def load_models():
    """Return ``(tokenizer, model)`` for ``MODEL_NAME``, or ``(None, None)``.

    The expensive load is delegated to a cached helper. The failure path is
    handled here, outside the cache, so a transient error (for example a
    network hiccup while downloading weights) is reported to the user but
    retried on the next script rerun instead of being cached.

    Returns:
        A ``(tokenizer, model)`` tuple on success, ``(None, None)`` on
        failure (after showing an error in the UI).
    """
    try:
        return _load_models_cached()
    except Exception as e:
        st.error(f"Failed to load models: {str(e)}")
        return None, None


tokenizer, code_model = load_models()
# UI Elements: page heading followed by a short description of the tool.
# NOTE(review): the leading "π" in the title looks like a mis-decoded emoji —
# confirm the intended character against the original file.
_INTRO_TEXT = """
Compare two Java code snippets to detect potential clones using CodeBERT embeddings.
The similarity score ranges from 0 (completely different) to 1 (identical).
"""
st.title("π Java Code Clone Detector")
st.markdown(_INTRO_TEXT)
# Example snippets used to pre-fill the two editors. They differ only in the
# class name.
EXAMPLE_1 = (
    "public class Hello {\n"
    "public static void main(String[] args) {\n"
    'System.out.println("Hello, World!");\n'
    "}\n"
    "}"
)
EXAMPLE_2 = (
    "public class Greet {\n"
    "public static void main(String[] args) {\n"
    'System.out.println("Hello, World!");\n'
    "}\n"
    "}"
)
# Layout: two side-by-side editors, each pre-filled with an example snippet.
col1, col2 = st.columns(2)
code1 = col1.text_area(
    "First Java Code",
    value=EXAMPLE_1,
    height=300,
    help="Enter the first Java code snippet",
)
code2 = col2.text_area(
    "Second Java Code",
    value=EXAMPLE_2,
    height=300,
    help="Enter the second Java code snippet",
)

# Threshold slider: similarity at or above this value is reported as a clone.
_THRESHOLD_OPTIONS = {
    "min_value": 0.5,
    "max_value": 1.0,
    "value": 0.85,
    "step": 0.01,
    "help": "Adjust the similarity threshold for clone detection",
}
threshold = st.slider("Clone Detection Threshold", **_THRESHOLD_OPTIONS)
# Normalization function
# One alternation that recognises, in priority order, Java literals and
# comments. Literals are matched so that comment markers *inside* them
# (e.g. "http://host") are never mistaken for real comments.
_JAVA_LITERAL_OR_COMMENT = re.compile(
    r'"""(?:\\.|.)*?"""'        # text block
    r'|"(?:\\.|[^"\\\n])*"'     # string literal (single line)
    r"|'(?:\\.|[^'\\\n])*'"     # char literal
    r"|//[^\n]*"                # line comment
    r"|/\*.*?\*/",              # block comment
    re.DOTALL,
)


def _drop_comment_keep_literal(match):
    """Return a literal unchanged; replace a comment with one space."""
    token = match.group(0)
    # Every comment alternative starts with '/', every literal with a quote.
    return " " if token.startswith("/") else token


def normalize_code(code):
    """Strip Java comments and collapse whitespace in *code*.

    Line (``// ...``) and block (``/* ... */``) comments are removed in a
    single left-to-right pass that also recognises string, char and text
    block literals, so comment markers inside a literal are preserved and a
    ``//`` inside a block comment cannot truncate it. Each comment is
    replaced by a space so the tokens on either side do not fuse. Every run
    of whitespace is then collapsed to a single space and the result is
    stripped.

    Args:
        code: Java source text.

    Returns:
        The normalized text. A non-string argument is returned unchanged.
    """
    if not isinstance(code, str):
        # Pass non-text values through untouched rather than raising.
        return code

    without_comments = _JAVA_LITERAL_OR_COMMENT.sub(
        _drop_comment_keep_literal, code
    )
    return re.sub(r"\s+", " ", without_comments).strip()
# Embedding generation
def get_embedding(code):
    """Return a pooled CodeBERT embedding for one code snippet.

    The snippet is normalized, tokenized (truncated and padded to
    ``MAX_LENGTH``) and run through the encoder without gradient tracking.
    The token states are then averaged over the sequence dimension using the
    attention mask as weights, so the padding positions added by
    ``padding='max_length'`` do not contribute to the embedding. An
    unweighted mean would be dominated by pad-token states for short inputs
    and push all similarities towards each other.

    Args:
        code: Source text of the snippet.

    Returns:
        One pooled vector per input sequence as a tensor, or ``None`` if any
        step fails (an error is shown in the UI in that case).
    """
    try:
        code = normalize_code(code)
        inputs = tokenizer(
            code,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
            padding='max_length'
        ).to(DEVICE)

        with torch.no_grad():
            outputs = code_model(**inputs)
            hidden = outputs.last_hidden_state

            # Broadcast the 0/1 attention mask over the hidden dimension.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            # Guard against division by zero if a mask were ever all zeros.
            token_count = mask.sum(dim=1).clamp(min=1.0)
            return (hidden * mask).sum(dim=1) / token_count  # Masked mean
    except Exception as e:
        st.error(f"Error processing code: {str(e)}")
        return None
# Comparison function
def compare_code(code1, code2):
    """Compute the cosine similarity between the embeddings of two snippets.

    Args:
        code1: First snippet.
        code2: Second snippet.

    Returns:
        The similarity as a Python float, or ``None`` when either snippet is
        empty/falsy or either embedding could not be produced.
    """
    if not (code1 and code2):
        return None

    # Both embeddings are always requested, in order, while the spinner shows.
    with st.spinner('Analyzing code...'):
        first_vec, second_vec = [
            get_embedding(snippet) for snippet in (code1, code2)
        ]

    if first_vec is None or second_vec is None:
        return None

    with torch.no_grad():
        score = F.cosine_similarity(first_vec, second_vec)
    return score.item()
# Compare button
if st.button("Compare Code", type="primary"):
    if tokenizer is None or code_model is None:
        st.error("Models failed to load. Please check the logs.")
    elif not (code1 or "").strip() or not (code2 or "").strip():
        # Without this, an empty editor made the button appear to do nothing.
        st.warning("Please enter both code snippets before comparing.")
    else:
        similarity = compare_code(code1, code2)

        # compare_code returns None when an embedding could not be produced;
        # get_embedding has already reported the error in that case.
        if similarity is not None:
            # Display results
            st.subheader("Results")

            # Progress bar for visualization. Cosine similarity can be
            # negative, or marginally above 1.0 through floating-point
            # rounding, and the progress bar only accepts floats in
            # [0.0, 1.0], so clamp the displayed value.
            st.progress(min(max(similarity, 0.0), 1.0))

            # Metrics columns
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Similarity Score", f"{similarity:.3f}")
            with col2:
                st.metric("Threshold", f"{threshold:.3f}")
            with col3:
                is_clone = similarity >= threshold
                # NOTE(review): the emoji below were restored from
                # mis-decoded text (the original literal was split across two
                # lines); confirm they match the intended characters.
                st.metric(
                    "Clone Detection",
                    "✅ Clone" if is_clone else "❌ Not a Clone",
                    delta=f"{similarity-threshold:+.3f}"
                )

            # Interpretation (fixed bands, independent of the slider value)
            if similarity > 0.95:
                st.success("The code snippets are nearly identical (potential Type-1 clone)")
            elif similarity > 0.85:
                st.success("The code snippets are very similar (potential Type-2 clone)")
            elif similarity > 0.7:
                st.warning("The code snippets show some similarity (potential Type-3 clone)")
            else:
                st.info("The code snippets are significantly different")

            # Show normalized code for debugging
            with st.expander("Show normalized code"):
                tab1, tab2 = st.tabs(["First Code", "Second Code"])
                with tab1:
                    st.code(normalize_code(code1))
                with tab2:
                    st.code(normalize_code(code2))
# Footer: horizontal rule followed by a short explanation of the pipeline.
st.markdown("---")
st.markdown("""
**How it works**:
1. Code is normalized (comments removed, whitespace standardized)
2. CodeBERT generates embeddings for each snippet
3. Cosine similarity is calculated between embeddings
4. Results are compared against your threshold
""")