---
library_name: transformers
license: apache-2.0
base_model:
- Qwen/Qwen2.5-Coder-3B-Instruct
---
# Model Card for local-repo-coder-v0
Generates and edits minimal multi-file Python code. It currently generates up to 2-3 files consistently, along with a `run.sh` bash script that orchestrates them, and adheres to PEP 8 style.
## Model Details
### Model Description
- **Developed by:** Reshinth Adithyan
- **License:** Apache 2.0
### Model Sources
- **Repository:** https://github.com/reshinthadithyan/repo-level-code/tree/main
### Generated Format
The model generates the repository in the following format. Code to parse it and materialize the repository is given below.
```txt
<libs>pytorch,wandb</libs>
<planning>PLANNING AS MARKDOWN FORMAT</planning>
<requirements>CONTENT FOR THE REQS FILE HERE</requirements>
<output><file1>src/dataset.py<content>YOUR PYTHON CODE HERE</content></file1>
<file2>src/model.py<content>YOUR PYTHON CODE HERE</content></file2>
<bashfile>run.sh<content>python3 src/model.py</content></bashfile></output>
```
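The parser in the Usage section below only materializes the files inside `<output>`. Here is a minimal sketch for also reading the `<libs>` tag and writing the `<requirements>` block to a `requirements.txt` (the helper name `extract_meta` is illustrative, not part of the model's tooling):
```python
import os
import re


def extract_meta(generated: str, output_dir: str) -> list[str]:
    """Return the <libs> list and write the <requirements> block to requirements.txt."""
    libs_match = re.search(r"<libs>(.*?)</libs>", generated, re.DOTALL)
    libs = [lib.strip() for lib in libs_match.group(1).split(",")] if libs_match else []
    reqs_match = re.search(r"<requirements>(.*?)</requirements>", generated, re.DOTALL)
    if reqs_match:
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "requirements.txt"), "w", encoding="utf-8") as f:
            f.write(reqs_match.group(1).strip() + "\n")
    return libs
```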
## Example
An example of generated output is given [here](https://huggingface.co/reshinthadith/local-repo-coder-v0/blob/main/example.txt). Processing it with the script below produces:
```text
Repository generated at: ./output_dir/demo2
demo2/
run.sh
src/
visualize_timeseries.py
```
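The generated repository can then be run via its bash entry point, assuming the dependencies named in the `<libs>`/`<requirements>` tags are installed:
```bash
cd output_dir/demo2
pip install -r requirements.txt  # only if you materialized the <requirements> block, as sketched above
bash run.sh
```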
### Usage
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import fire
from pathlib import Path
import os
import re


def generate_repo_from_string(input_str: str, output_dir: str) -> None:
"""
Parse <output> tags in the input string and write files (and bashfiles) to the specified output directory.
- Searches for <output>...</output> section.
- Within that, finds all <fileX> or <bashfile> tags:
<file1>path/to/file.ext<content>...file content...</content></file1>
<bashfile>script.sh<content>...script content...</content></bashfile>
Args:
input_str: The full string containing <output> markup.
output_dir: Directory where files will be created. Existing files will be overwritten.
"""
# Extract the content inside <output>...</output>
out_match = re.search(r"<output>(.*?)</output>", input_str, re.DOTALL)
if not out_match:
raise ValueError("No <output> section found in input.")
output_section = out_match.group(1)
# Regex to find file tags: file1, file2, file3, ... and bashfile
pattern = re.compile(
r"<(file\d+|bashfile)>([^<]+?)<content>(.*?)</content></\1>",
re.DOTALL
)
for tag, filename, content in pattern.findall(output_section):
# Determine full path
file_path = os.path.join(output_dir, filename.strip())
# Ensure parent directory exists
parent = os.path.dirname(file_path)
if parent:
os.makedirs(parent, exist_ok=True)
# Write content to file
with open(file_path, 'w', encoding='utf-8') as f:
# Strip only one leading newline if present
f.write(content.lstrip('\n'))
print(f"Repository generated at: {output_dir}")
def main(model_path:str="./models_dir/repo_coder_v1",
prompt:str="Generate a small python repo for matplotlib to visualize timeseries data to read from timeseries.csv file using polars."
,output_path="./output_dir/demo2"):
input_prompt = "###Instruction: {prompt}".format(prompt=prompt)
def load_model(model_path):
"""
Load the model and tokenizer from the specified path.
"""
tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto").to("cuda:0")  # assumes a CUDA GPU is available
model.eval()
return model, tokenizer
model, tokenizer = load_model(model_path)
print(f"Loaded model from {model_path}.")
    inputs = tokenizer(input_prompt, return_tensors="pt").to(model.device)  # `inputs` avoids shadowing the built-in `input`
    with torch.no_grad():
        output = model.generate(**inputs, max_length=1024, do_sample=True, temperature=0.9, top_p=0.95, top_k=50)
    generated_code_repo = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"Generated code repo: {generated_code_repo}")
Path(output_path).mkdir(parents=True, exist_ok=True)
generate_repo_from_string(generated_code_repo, output_path)
def list_files(startpath):
for root, dirs, files in os.walk(startpath):
level = root.replace(startpath, '').count(os.sep)
indent = ' ' * 4 * (level)
print('{}{}/'.format(indent, os.path.basename(root)))
subindent = ' ' * 4 * (level + 1)
for f in files:
print('{}{}'.format(subindent, f))
    list_files(output_path)


if __name__ == "__main__":
fire.Fire(main)
``` |
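Since the entry point is wrapped with `fire.Fire`, each argument of `main` maps to a CLI flag. A hypothetical invocation, assuming the script above is saved as `generate_repo.py` and the model is pulled from the Hub:
```bash
python generate_repo.py \
  --model_path reshinthadith/local-repo-coder-v0 \
  --prompt "Generate a small python repo for matplotlib to visualize timeseries data from timeseries.csv using polars." \
  --output_path ./output_dir/demo2
```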