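"""Java Code Clone Detector: a Streamlit app that compares two Java snippets
using CodeBERT embeddings and flags likely clones via cosine similarity.

To run locally (assuming this file is saved as app.py):

    pip install streamlit torch transformers
    streamlit run app.py
"""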
import re
import warnings

import streamlit as st
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# Set up page config
st.set_page_config(
    page_title="Java Code Clone Detector",
    page_icon="πŸ”",
    layout="wide"
)

# Suppress warnings
warnings.filterwarnings("ignore")

# Constants
MODEL_NAME = "microsoft/codebert-base"
MAX_LENGTH = 512
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize models with caching
@st.cache_resource
def load_models():
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
        return tokenizer, model
    except Exception as e:
        st.error(f"Failed to load models: {str(e)}")
        return None, None

tokenizer, code_model = load_models()

# UI Elements
st.title("πŸ” Java Code Clone Detector")
st.markdown("""
Compare two Java code snippets to detect potential clones using CodeBERT embeddings.
The similarity score typically ranges from 0 (completely different) to 1 (identical).
""")

# Example code
EXAMPLE_1 = """public class Hello {
    public static void main(String[] args) {
        System.out.println("Hello, World!");
    }
}"""

EXAMPLE_2 = """public class Greet {
    public static void main(String[] args) {
        System.out.println("Hello, World!");
    }
}"""

# Layout
col1, col2 = st.columns(2)

with col1:
    code1 = st.text_area(
        "First Java Code", 
        height=300,
        value=EXAMPLE_1,
        help="Enter the first Java code snippet"
    )

with col2:
    code2 = st.text_area(
        "Second Java Code", 
        height=300,
        value=EXAMPLE_2,
        help="Enter the second Java code snippet"
    )

# Threshold slider
threshold = st.slider(
    "Clone Detection Threshold",
    min_value=0.5,
    max_value=1.0,
    value=0.85,
    step=0.01,
    help="Adjust the similarity threshold for clone detection"
)

# Normalization function
def normalize_code(code):
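    """Strip comments and collapse whitespace so purely cosmetic differences
    do not affect the embedding.

    Example: 'int x = 1; // count' becomes 'int x = 1;'.

    Note: the regexes are heuristic; they will also strip '//' inside string
    literals (e.g. URLs), which is acceptable for rough clone detection.
    """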
    try:
        code = re.sub(r'//.*', '', code)  # Remove single-line comments
        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)  # Multi-line comments
        code = re.sub(r'\s+', ' ', code).strip()  # Normalize whitespace
        return code
    except Exception:
        return code

# Embedding generation
def get_embedding(code):
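    """Return a (1, hidden_size) mean-pooled CodeBERT embedding for `code`,
    or None if tokenization or inference fails."""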
    try:
        code = normalize_code(code)
        inputs = tokenizer(
            code,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH
        ).to(DEVICE)
        
        with torch.no_grad():
            outputs = code_model(**inputs)
        
        # Mean-pool over the token dimension. Padding is omitted for this
        # single sequence, so the mean covers only real tokens instead of
        # being diluted by hundreds of PAD embeddings.
        return outputs.last_hidden_state.mean(dim=1)
    except Exception as e:
        st.error(f"Error processing code: {str(e)}")
        return None

# Comparison function
def compare_code(code1, code2):
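    """Embed both snippets and return their cosine similarity as a float,
    or None if either input is empty or embedding fails."""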
    if not code1 or not code2:
        return None
    
    with st.spinner('Analyzing code...'):
        emb1 = get_embedding(code1)
        emb2 = get_embedding(code2)
        
        if emb1 is None or emb2 is None:
            return None
        
        with torch.no_grad():
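            # cos(a, b) = (a · b) / (||a|| * ||b||), computed by
            # F.cosine_similarity along dim=1 (the feature axis)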
            similarity = F.cosine_similarity(emb1, emb2).item()
        
        return similarity

# Compare button
if st.button("Compare Code", type="primary"):
    if tokenizer is None or code_model is None:
        st.error("Models failed to load. Please check the logs.")
    else:
        similarity = compare_code(code1, code2)
        
        if similarity is not None:
            # Display results
            st.subheader("Results")
            
            # Progress bar for visualization (st.progress requires a value
            # in [0, 1]; cosine similarity can dip marginally below 0, so
            # clamp defensively)
            st.progress(min(max(similarity, 0.0), 1.0))
            
            # Metrics columns
            col1, col2, col3 = st.columns(3)
            
            with col1:
                st.metric("Similarity Score", f"{similarity:.3f}")
            
            with col2:
                st.metric("Threshold", f"{threshold:.3f}")
            
            with col3:
                is_clone = similarity >= threshold
                st.metric(
                    "Clone Detection", 
                    "βœ… Clone" if is_clone else "❌ Not a Clone",
                    delta=f"{similarity-threshold:+.3f}"
                )
            
            # Interpretation
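            # The cut-offs below are heuristics, not calibrated clone-type
            # boundaries; treat the labels as indicative only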
            if similarity > 0.95:
                st.success("The code snippets are nearly identical (potential Type-1 clone)")
            elif similarity > 0.85:
                st.success("The code snippets are very similar (potential Type-2 clone)")
            elif similarity > 0.7:
                st.warning("The code snippets show some similarity (potential Type-3 clone)")
            else:
                st.info("The code snippets are significantly different")
            
            # Show normalized code for debugging
            with st.expander("Show normalized code"):
                tab1, tab2 = st.tabs(["First Code", "Second Code"])
                
                with tab1:
                    st.code(normalize_code(code1))
                
                with tab2:
                    st.code(normalize_code(code2))

# Footer
st.markdown("---")
st.markdown("""
**How it works**:
1. Code is normalized (comments removed, whitespace standardized)
2. CodeBERT generates embeddings for each snippet
3. Cosine similarity is calculated between embeddings
4. Results are compared against your threshold
""")