|
--- |
|
library_name: transformers |
|
license: apache-2.0 |
|
base_model: |
|
- Qwen/Qwen2.5-Coder-3B-Instruct |
|
--- |
|
|
|
# Model Card for local-repo-coder-v0 |
|
|
|
Generates and edits minimal multi-file Python code. It currently generates up to 2-3 files consistently, together with a runner bash script (e.g. `run.sh`) that orchestrates them. The generated code follows PEP 8 style. |
|
|
|
|
|
## Model Details |
|
|
|
### Model Description |
|
|
|
- **Developed by:** Reshinth Adithyan |
|
- **License:** Apache 2.0 |
|
|
|
### Model Sources |
|
|
|
<!-- Provide the basic links for the model. --> |
|
|
|
- **Repository:** https://github.com/reshinthadithyan/repo-level-code/tree/main |
|
|
|
### Generated Format |
|
The model generates the repository in the following format. Code to parse this format and create the repository on disk is given in the Usage section below. |
|
```txt |
|
<libs>pytorch,wandb</libs> |
|
<planning>PLANNING AS MARKDOWN FORMAT</planning> |
|
<requirements>CONTENT FOR THE REQS FILE HERE</requirements> |
|
<output><file1>src/dataset.py<content>YOUR PYTHON CODE HERE</content></file1> |
|
<file2>src/model.py<content>YOUR PYTHON CODE HERE</content></file2> |
|
<bashfile>run.sh<content>python3 src/model.py</content></bashfile></output> |
|
|
|
``` |
|
## Example |
|
|
|
An example of generated output is given [here](https://huggingface.co/reshinthadith/local-repo-coder-v0/blob/main/example.txt). Processing it with the script below produces the following repository: |
|
|
|
```text |
|
Repository generated at: ./output_dir/demo2 |
|
demo2/ |
|
    run.sh |
|
    src/ |
|
        visualize_timeseries.py |
|
``` |
|
|
|
### Usage |
|
```python |
|
import torch |
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
import fire |
|
from pathlib import Path |
|
import os |
|
import re |
|
|
|
def generate_repo_from_string(input_str: str, output_dir: str) -> None:
    """
    Parse the <output> section of a generation and write its files to disk.

    Only the first <output>...</output> section is used. Inside it, every
    <fileN> or <bashfile> entry of the following shape becomes one file:

        <file1>path/to/file.ext<content>...file content...</content></file1>
        <bashfile>script.sh<content>...script content...</content></bashfile>

    Paths are interpreted relative to ``output_dir``. Because they come from
    model-generated text, all of them are validated before anything is
    written; if any entry would land outside ``output_dir`` nothing is
    written at all.

    Args:
        input_str: The full string containing <output> markup.
        output_dir: Directory where files will be created. Existing files
            will be overwritten.

    Raises:
        ValueError: If no <output> section is found, or if a file path
            resolves outside ``output_dir`` (absolute path or ``..``
            traversal).
    """
    # Extract the content inside <output>...</output>
    out_match = re.search(r"<output>(.*?)</output>", input_str, re.DOTALL)
    if not out_match:
        raise ValueError("No <output> section found in input.")
    output_section = out_match.group(1)

    # Regex to find file tags: file1, file2, file3, ... and bashfile.
    # The back-reference \1 forces the closing tag to match the opening one.
    pattern = re.compile(
        r"<(file\d+|bashfile)>([^<]+?)<content>(.*?)</content></\1>",
        re.DOTALL,
    )

    # First pass: resolve and validate every target so that a bad entry
    # cannot leave a half-written repository behind.
    root = os.path.realpath(output_dir)
    planned = []
    for _tag, filename, content in pattern.findall(output_section):
        relative_name = filename.strip()
        file_path = os.path.join(output_dir, relative_name)
        resolved = os.path.realpath(file_path)
        if os.path.commonpath([root, resolved]) != root:
            raise ValueError(
                f"Refusing to write outside the output directory: {relative_name!r}"
            )
        planned.append((file_path, content))

    # Second pass: create parent directories and write the files.
    for file_path, content in planned:
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(file_path, "w", encoding="utf-8") as f:
            # Drop leading newlines so files do not start with blank lines.
            f.write(content.lstrip("\n"))

    print(f"Repository generated at: {output_dir}")
|
|
|
|
|
def _load_model(model_path):
    """Load the causal LM and its tokenizer from ``model_path``.

    The model is moved to the first CUDA device when one is available and
    stays on the CPU otherwise, then switched to evaluation mode.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto").to(device)
    model.eval()
    return model, tokenizer


def _list_files(startpath):
    """Print the directory tree under ``startpath``, four spaces per level."""
    for root, _dirs, files in os.walk(startpath):
        # Depth is taken from the relative path so that a trailing separator
        # on startpath, or startpath recurring inside a sub-path, cannot
        # skew the indentation.
        rel = os.path.relpath(root, startpath)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = " " * 4 * level
        print(f"{indent}{os.path.basename(os.path.normpath(root))}/")
        subindent = " " * 4 * (level + 1)
        for name in files:
            print(f"{subindent}{name}")


def main(
    model_path: str = "./models_dir/repo_coder_v1",
    prompt: str = "Generate a small python repo for matplotlib to visualize timeseries data to read from timeseries.csv file using polars.",
    output_path: str = "./output_dir/demo2",
) -> None:
    """Generate a small repository from a prompt and write it to disk.

    Loads the model, samples one completion for the instruction prompt,
    parses the generated markup into files under ``output_path`` and prints
    the resulting directory tree.

    Args:
        model_path: Path or identifier passed to ``from_pretrained``.
        prompt: Natural-language description of the repository to generate.
        output_path: Directory to create (if needed) and populate.
    """
    input_prompt = f"###Instruction: {prompt}"

    model, tokenizer = _load_model(model_path)
    print(f"Loaded model from {model_path}.")

    # Named model_inputs rather than input to avoid shadowing the builtin.
    model_inputs = tokenizer(input_prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # NOTE(review): per the transformers docs, max_length bounds prompt
        # plus generated tokens together - confirm 1024 is enough for the
        # repositories you expect.
        output = model.generate(
            **model_inputs,
            max_length=1024,
            do_sample=True,
            temperature=0.9,
            top_p=0.95,
            top_k=50,
        )
    generated_code_repo = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"Generated code repo: {generated_code_repo}")

    Path(output_path).mkdir(parents=True, exist_ok=True)
    generate_repo_from_string(generated_code_repo, output_path)

    _list_files(output_path)
|
|
|
|
|
# Expose main() as a command-line interface through python-fire when this
# file is executed as a script.
if __name__ == "__main__":
    fire.Fire(main)
|
|
|
``` |