diff --git a/text-generation-webui b/text-generation-webui deleted file mode 160000 index 1dc464dcb0bad8a7221bd18198c8ff50bcb00b41..0000000000000000000000000000000000000000 --- a/text-generation-webui +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1dc464dcb0bad8a7221bd18198c8ff50bcb00b41 diff --git a/text-generation-webui/.dockerignore b/text-generation-webui/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..6073533e0929aac9e917a5980198334a2a01f8ef --- /dev/null +++ b/text-generation-webui/.dockerignore @@ -0,0 +1,9 @@ +.env +Dockerfile +/characters +/loras +/models +/presets +/prompts +/softprompts +/training diff --git a/text-generation-webui/.env.example b/text-generation-webui/.env.example new file mode 100644 index 0000000000000000000000000000000000000000..d20300b776745fa8c767c6880fed1d21c6754d09 --- /dev/null +++ b/text-generation-webui/.env.example @@ -0,0 +1,25 @@ +# by default the Dockerfile specifies these versions: 3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX +# however, to get it working I had to specify the exact version for my card (an RTX 2060), which is 7.5 +# you can find the version for your card here: https://developer.nvidia.com/cuda-gpus +TORCH_CUDA_ARCH_LIST=7.5 + +# these commands worked for me with roughly 4.5GB of vram +CLI_ARGS=--model llama-7b-4bit --wbits 4 --listen --auto-devices + +# the following examples have been tested with the files linked in docs/README_docker.md: +# example running 13b with 4bit/128 groupsize : CLI_ARGS=--model llama-13b-4bit-128g --wbits 4 --listen --groupsize 128 --pre_layer 25 +# example with loading api extension and public share: CLI_ARGS=--model llama-7b-4bit --wbits 4 --listen --auto-devices --no-stream --extensions api --share +# example running 7b in 8bit mode : CLI_ARGS=--model llama-7b --load-in-8bit --listen --auto-devices + +# the port the webui binds to on the host +HOST_PORT=7860 +# the port the webui binds to inside the container +CONTAINER_PORT=7860 + +# the port the api binds
to on the host +HOST_API_PORT=5000 +# the port the api binds to inside the container +CONTAINER_API_PORT=5000 + +# the version used to install text-generation-webui from +WEBUI_VERSION=HEAD diff --git a/text-generation-webui/.github/FUNDING.yml b/text-generation-webui/.github/FUNDING.yml new file mode 100644 index 0000000000000000000000000000000000000000..57b7f6982f7d7b7b9677c795488b11864d69d19e --- /dev/null +++ b/text-generation-webui/.github/FUNDING.yml @@ -0,0 +1 @@ +ko_fi: oobabooga diff --git a/text-generation-webui/.github/ISSUE_TEMPLATE/bug_report_template.yml b/text-generation-webui/.github/ISSUE_TEMPLATE/bug_report_template.yml new file mode 100644 index 0000000000000000000000000000000000000000..bd30a0c9c17dd514bf364846fe7914b6d10a4584 --- /dev/null +++ b/text-generation-webui/.github/ISSUE_TEMPLATE/bug_report_template.yml @@ -0,0 +1,53 @@ +name: "Bug report" +description: Report a bug +labels: [ "bug" ] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! + - type: textarea + id: bug-description + attributes: + label: Describe the bug + description: A clear and concise description of what the bug is. + placeholder: Bug description + validations: + required: true + - type: checkboxes + attributes: + label: Is there an existing issue for this? + description: Please search to see if an issue already exists for the issue you encountered. + options: + - label: I have searched the existing issues + required: true + - type: textarea + id: reproduction + attributes: + label: Reproduction + description: Please provide the steps necessary to reproduce your issue. + placeholder: Reproduction + validations: + required: true + - type: textarea + id: screenshot + attributes: + label: Screenshot + description: "If possible, please include screenshot(s) so that we can understand what the issue is." 
+ - type: textarea + id: logs + attributes: + label: Logs + description: "Please include the full stacktrace of the errors you get in the command-line (if any)." + render: shell + validations: + required: true + - type: textarea + id: system-info + attributes: + label: System Info + description: "Please share your system info with us: operating system, GPU brand, and GPU model. If you are using a Google Colab notebook, mention that instead." + render: shell + placeholder: + validations: + required: true diff --git a/text-generation-webui/.github/ISSUE_TEMPLATE/feature_request.md b/text-generation-webui/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000000000000000000000000000000000..b94974f865491731a1251e3e9736e01cbe81b06f --- /dev/null +++ b/text-generation-webui/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,16 @@ +--- +name: Feature request +about: Suggest an improvement or new feature for the web UI +title: '' +labels: 'enhancement' +assignees: '' + +--- + +**Description** + +A clear and concise description of what you want to be implemented. + +**Additional Context** + +If applicable, please provide any extra information, external links, or screenshots that could be useful. diff --git a/text-generation-webui/.github/dependabot.yml b/text-generation-webui/.github/dependabot.yml new file mode 100644 index 0000000000000000000000000000000000000000..91abb11fdf507883caeeb2d2958e1c65fb6cbdc1 --- /dev/null +++ b/text-generation-webui/.github/dependabot.yml @@ -0,0 +1,11 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. 
+# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/" # Location of package manifests + schedule: + interval: "weekly" diff --git a/text-generation-webui/.github/workflows/stale.yml b/text-generation-webui/.github/workflows/stale.yml new file mode 100644 index 0000000000000000000000000000000000000000..ce603a4f0a90845b7107da863b6ff1d9fb5d4bf2 --- /dev/null +++ b/text-generation-webui/.github/workflows/stale.yml @@ -0,0 +1,22 @@ +name: Close inactive issues +on: + schedule: + - cron: "10 23 * * *" + +jobs: + close-issues: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v5 + with: + stale-issue-message: "" + close-issue-message: "This issue has been closed due to inactivity for 30 days. If you believe it is still relevant, please leave a comment below." 
+ days-before-issue-stale: 30 + days-before-issue-close: 0 + stale-issue-label: "stale" + days-before-pr-stale: -1 + days-before-pr-close: -1 + repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/text-generation-webui/.gitignore b/text-generation-webui/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..2d007efe64160f8538df52f7cb5bf73196643895 --- /dev/null +++ b/text-generation-webui/.gitignore @@ -0,0 +1,26 @@ +cache +characters +training/datasets +extensions/silero_tts/outputs +extensions/elevenlabs_tts/outputs +extensions/sd_api_pictures/outputs +logs +loras +models +repositories +softprompts +torch-dumps +*pycache* +*/*pycache* +*/*/pycache* +venv/ +.venv/ +.vscode +*.bak +*.ipynb +*.log + +settings.json +img_bot* +img_me* +prompts/[0-9]* diff --git a/text-generation-webui/Dockerfile b/text-generation-webui/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..76f3fdd470deebdcd5edfa6434fb792adbf475d7 --- /dev/null +++ b/text-generation-webui/Dockerfile @@ -0,0 +1,68 @@ +FROM nvidia/cuda:11.8.0-devel-ubuntu22.04 as builder + +RUN apt-get update && \ + apt-get install --no-install-recommends -y git vim build-essential python3-dev python3-venv && \ + rm -rf /var/lib/apt/lists/* + +RUN git clone https://github.com/oobabooga/GPTQ-for-LLaMa /build + +WORKDIR /build + +RUN python3 -m venv /build/venv +RUN . /build/venv/bin/activate && \ + pip3 install --upgrade pip setuptools && \ + pip3 install torch torchvision torchaudio && \ + pip3 install -r requirements.txt + +# https://developer.nvidia.com/cuda-gpus +# for a rtx 2060: ARG TORCH_CUDA_ARCH_LIST="7.5" +ARG TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX" +RUN . /build/venv/bin/activate && \ + python3 setup_cuda.py bdist_wheel -d . 
+ +FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04 + +LABEL maintainer="Your Name " +LABEL description="Docker image for GPTQ-for-LLaMa and Text Generation WebUI" + +RUN apt-get update && \ + apt-get install --no-install-recommends -y git python3 python3-pip && \ + rm -rf /var/lib/apt/lists/* + +RUN --mount=type=cache,target=/root/.cache/pip pip3 install virtualenv +RUN mkdir /app + +WORKDIR /app + +ARG WEBUI_VERSION +RUN test -n "${WEBUI_VERSION}" && git reset --hard ${WEBUI_VERSION} || echo "Using provided webui source" + +RUN virtualenv /app/venv +RUN . /app/venv/bin/activate && \ + pip3 install --upgrade pip setuptools && \ + pip3 install torch torchvision torchaudio + +COPY --from=builder /build /app/repositories/GPTQ-for-LLaMa +RUN . /app/venv/bin/activate && \ + pip3 install /app/repositories/GPTQ-for-LLaMa/*.whl + +COPY extensions/api/requirements.txt /app/extensions/api/requirements.txt +COPY extensions/elevenlabs_tts/requirements.txt /app/extensions/elevenlabs_tts/requirements.txt +COPY extensions/google_translate/requirements.txt /app/extensions/google_translate/requirements.txt +COPY extensions/silero_tts/requirements.txt /app/extensions/silero_tts/requirements.txt +COPY extensions/whisper_stt/requirements.txt /app/extensions/whisper_stt/requirements.txt +RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/api && pip3 install -r requirements.txt +RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/elevenlabs_tts && pip3 install -r requirements.txt +RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/google_translate && pip3 install -r requirements.txt +RUN --mount=type=cache,target=/root/.cache/pip . /app/venv/bin/activate && cd extensions/silero_tts && pip3 install -r requirements.txt +RUN --mount=type=cache,target=/root/.cache/pip . 
/app/venv/bin/activate && cd extensions/whisper_stt && pip3 install -r requirements.txt + +COPY requirements.txt /app/requirements.txt +RUN . /app/venv/bin/activate && \ + pip3 install -r requirements.txt + +RUN cp /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so /app/venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so + +COPY . /app/ +ENV CLI_ARGS="" +CMD . /app/venv/bin/activate && python3 server.py ${CLI_ARGS} diff --git a/text-generation-webui/Gradio app.py b/text-generation-webui/Gradio app.py new file mode 100644 index 0000000000000000000000000000000000000000..a699bc5b3c2e987102ca93e0ee28d601e0a93d02 --- /dev/null +++ b/text-generation-webui/Gradio app.py @@ -0,0 +1,7 @@ +import gradio as gr + +def greet(name): + return "Hello " + name + "!!" + +iface = gr.Interface(fn=greet, inputs="text", outputs="text") +iface.launch() \ No newline at end of file diff --git a/text-generation-webui/LICENSE b/text-generation-webui/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..0ad25db4bd1d86c452db3f9602ccdbe172438f52 --- /dev/null +++ b/text-generation-webui/LICENSE @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users.
+ + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. 
+ + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. 
If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. 
For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. 
Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. 
This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. 
+ + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. 
+ + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published + by the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code.
There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +<https://www.gnu.org/licenses/>. diff --git a/text-generation-webui/README.md b/text-generation-webui/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9b77ca500cdb13628359fb07987da7690ff2996b --- /dev/null +++ b/text-generation-webui/README.md @@ -0,0 +1,299 @@ +# Text generation web UI + +A gradio web UI for running Large Language Models like LLaMA, llama.cpp, GPT-J, OPT, and GALACTICA. + +Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) of text generation. + +[[Try it on Google Colab]](https://colab.research.google.com/github/oobabooga/AI-Notebooks/blob/main/Colab-TextGen-GPU.ipynb) + +|![Image1](https://github.com/oobabooga/screenshots/raw/main/qa.png) | ![Image2](https://github.com/oobabooga/screenshots/raw/main/cai3.png) | +|:---:|:---:| +|![Image3](https://github.com/oobabooga/screenshots/raw/main/gpt4chan.png) | ![Image4](https://github.com/oobabooga/screenshots/raw/main/galactica.png) | + +## Features + +* Dropdown menu for switching between models +* Notebook mode that resembles OpenAI's playground +* Chat mode for conversation and role playing +* Instruct mode compatible with Alpaca and Open Assistant formats **\*NEW!\*** +* Nice HTML output for GPT-4chan +* Markdown output for [GALACTICA](https://github.com/paperswithcode/galai), including LaTeX rendering +* [Custom chat characters](https://github.com/oobabooga/text-generation-webui/wiki/Custom-chat-characters) +* Advanced chat features (send images, get audio responses with TTS) +* Very efficient text streaming +* Parameter presets +* 8-bit mode +* Layers
splitting across GPU(s), CPU, and disk +* CPU mode +* [FlexGen](https://github.com/oobabooga/text-generation-webui/wiki/FlexGen) +* [DeepSpeed ZeRO-3](https://github.com/oobabooga/text-generation-webui/wiki/DeepSpeed) +* API [with](https://github.com/oobabooga/text-generation-webui/blob/main/api-example-stream.py) streaming and [without](https://github.com/oobabooga/text-generation-webui/blob/main/api-example.py) streaming +* [LLaMA model, including 4-bit GPTQ](https://github.com/oobabooga/text-generation-webui/wiki/LLaMA-model) +* [llama.cpp](https://github.com/oobabooga/text-generation-webui/wiki/llama.cpp-models) **\*NEW!\*** +* [RWKV model](https://github.com/oobabooga/text-generation-webui/wiki/RWKV-model) +* [LoRA (loading and training)](https://github.com/oobabooga/text-generation-webui/wiki/Using-LoRAs) +* Softprompts +* [Extensions](https://github.com/oobabooga/text-generation-webui/wiki/Extensions) +* [Google Colab](https://github.com/oobabooga/text-generation-webui/wiki/Running-on-Colab) + +## Installation + +### One-click installers + +[oobabooga-windows.zip](https://github.com/oobabooga/text-generation-webui/releases/download/installers/oobabooga-windows.zip) + +Just download the zip above, extract it, and double click on "install". The web UI and all its dependencies will be installed in the same folder. + +* To download a model, double click on "download-model" +* To start the web UI, double click on "start-webui" + +Source codes: https://github.com/oobabooga/one-click-installers + +> **Note** +> +> Thanks to [@jllllll](https://github.com/jllllll) and [@ClayShoaf](https://github.com/ClayShoaf), the Windows 1-click installer now sets up 8-bit and 4-bit requirements out of the box. No additional installation steps are necessary. + +> **Note** +> +> There is no need to run the installer as admin. + +### Manual installation using Conda + +Recommended if you have some experience with the command-line. 
+ +On Windows, I additionally recommend carrying out the installation on WSL instead of the base system: [WSL installation guide](https://github.com/oobabooga/text-generation-webui/wiki/WSL-installation-guide). + +#### 0. Install Conda + +https://docs.conda.io/en/latest/miniconda.html + +On Linux or WSL, it can be automatically installed with these two commands: + +``` +curl -sL "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" > "Miniconda3.sh" +bash Miniconda3.sh +``` + +Source: https://educe-ubc.github.io/conda.html + +#### 1. Create a new conda environment + +``` +conda create -n textgen python=3.10.9 +conda activate textgen +``` + +#### 2. Install PyTorch + +| System | GPU | Command | +|--------|---------|---------| +| Linux/WSL | NVIDIA | `pip3 install torch torchvision torchaudio` | +| Linux | AMD | `pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.4.2` | +| MacOS + MPS (untested) | Any | `pip3 install torch torchvision torchaudio` | + +The up-to-date commands can be found here: https://pytorch.org/get-started/locally/. + +#### 2.1 Special instructions + +* MacOS users: https://github.com/oobabooga/text-generation-webui/pull/393 +* AMD users: https://rentry.org/eq3hg + +#### 3. Install the web UI + +``` +git clone https://github.com/oobabooga/text-generation-webui +cd text-generation-webui +pip install -r requirements.txt +``` + +> **Note** +> +> For bitsandbytes and `--load-in-8bit` to work on Linux/WSL, this dirty fix is currently necessary: https://github.com/oobabooga/text-generation-webui/issues/400#issuecomment-1474876859 + + +### Alternative: manual Windows installation + +As an alternative to the recommended WSL method, you can install the web UI natively on Windows using this guide. It will be a lot harder and the performance may be slower: [Windows installation guide](https://github.com/oobabooga/text-generation-webui/wiki/Windows-installation-guide).
+ +### Alternative: Docker + +``` +cp .env.example .env +docker compose up --build +``` + +Make sure to edit `.env` and set the appropriate CUDA version for your GPU. + +You need to have docker compose v2.17 or higher installed on your system. For installation instructions, see [Docker compose installation](https://github.com/oobabooga/text-generation-webui/wiki/Docker-compose-installation). + +Contributed by [@loeken](https://github.com/loeken) in [#633](https://github.com/oobabooga/text-generation-webui/pull/633) + +### Updating the requirements + +From time to time, the `requirements.txt` changes. To update, use this command: + +``` +conda activate textgen +cd text-generation-webui +pip install -r requirements.txt --upgrade +``` +## Downloading models + +Models should be placed inside the `models` folder. + +[Hugging Face](https://huggingface.co./models?pipeline_tag=text-generation&sort=downloads) is the main place to download models. These are some examples: + +* [Pythia](https://huggingface.co./models?sort=downloads&search=eleutherai%2Fpythia+deduped) +* [OPT](https://huggingface.co./models?search=facebook/opt) +* [GALACTICA](https://huggingface.co./models?search=facebook/galactica) +* [GPT-J 6B](https://huggingface.co./EleutherAI/gpt-j-6B/tree/main) + +You can automatically download a model from HF using the script `download-model.py`: + + python download-model.py organization/model + +For example: + + python download-model.py facebook/opt-1.3b + +If you want to download a model manually, note that all you need are the json, txt, and pytorch\*.bin (or model*.safetensors) files. The remaining files are not necessary. + +#### GPT-4chan + +[GPT-4chan](https://huggingface.co./ykilcher/gpt-4chan) has been removed from Hugging Face, so you need to download it elsewhere.
You have two options: + +* Torrent: [16-bit](https://archive.org/details/gpt4chan_model_float16) / [32-bit](https://archive.org/details/gpt4chan_model) +* Direct download: [16-bit](https://theswissbay.ch/pdf/_notpdf_/gpt4chan_model_float16/) / [32-bit](https://theswissbay.ch/pdf/_notpdf_/gpt4chan_model/) + +The 32-bit version is only relevant if you intend to run the model in CPU mode. Otherwise, you should use the 16-bit version. + +After downloading the model, follow these steps: + +1. Place the files under `models/gpt4chan_model_float16` or `models/gpt4chan_model`. +2. Place GPT-J 6B's config.json file in that same folder: [config.json](https://huggingface.co./EleutherAI/gpt-j-6B/raw/main/config.json). +3. Download GPT-J 6B's tokenizer files (they will be automatically detected when you attempt to load GPT-4chan): + +``` +python download-model.py EleutherAI/gpt-j-6B --text-only +``` + +## Starting the web UI + + conda activate textgen + cd text-generation-webui + python server.py + +Then browse to + +`http://localhost:7860/?__theme=dark` + +Optionally, you can use the following command-line flags: + +#### Basic settings + +| Flag | Description | +|--------------------------------------------|-------------| +| `-h`, `--help` | Show this help message and exit. | +| `--notebook` | Launch the web UI in notebook mode, where the output is written to the same text box as the input. | +| `--chat` | Launch the web UI in chat mode. | +| `--model MODEL` | Name of the model to load by default. | +| `--lora LORA` | Name of the LoRA to apply to the model by default. | +| `--model-dir MODEL_DIR` | Path to directory with all the models. | +| `--lora-dir LORA_DIR` | Path to directory with all the loras. | +| `--no-stream` | Don't stream the text output in real time. | +| `--settings SETTINGS_FILE` | Load the default interface settings from this json file. See `settings-template.json` for an example. 
If you create a file called `settings.json`, this file will be loaded by default without the need to use the `--settings` flag. | +| `--extensions EXTENSIONS [EXTENSIONS ...]` | The list of extensions to load. If you want to load more than one extension, write the names separated by spaces. | +| `--verbose` | Print the prompts to the terminal. | + +#### Accelerate/transformers + +| Flag | Description | +|---------------------------------------------|-------------| +| `--cpu` | Use the CPU to generate text. | +| `--auto-devices` | Automatically split the model across the available GPU(s) and CPU. | +| `--gpu-memory GPU_MEMORY [GPU_MEMORY ...]` | Maximum GPU memory in GiB to be allocated per GPU. Example: `--gpu-memory 10` for a single GPU, `--gpu-memory 10 5` for two GPUs. You can also set values in MiB like `--gpu-memory 3500MiB`. | +| `--cpu-memory CPU_MEMORY` | Maximum CPU memory in GiB to allocate for offloaded weights. Same as above.| +| `--disk` | If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk. | +| `--disk-cache-dir DISK_CACHE_DIR` | Directory to save the disk cache to. Defaults to `cache/`. | +| `--load-in-8bit` | Load the model with 8-bit precision.| +| `--bf16` | Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU. | +| `--no-cache` | Set `use_cache` to False while generating text. This reduces the VRAM usage a bit with a performance cost. | + +#### llama.cpp + +| Flag | Description | +|-------------|-------------| +| `--threads` | Number of threads to use in llama.cpp. | + +#### GPTQ + +| Flag | Description | +|---------------------------|-------------| +| `--wbits WBITS` | GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported. | +| `--model_type MODEL_TYPE` | GPTQ: Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported. | +| `--groupsize GROUPSIZE` | GPTQ: Group size.
| +| `--pre_layer PRE_LAYER` | GPTQ: The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. | + +#### FlexGen + +| Flag | Description | +|------------------|-------------| +| `--flexgen` | Enable the use of FlexGen offloading. | +| `--percent PERCENT [PERCENT ...]` | FlexGen: allocation percentages. Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0). | +| `--compress-weight` | FlexGen: Whether to compress weight (default: False).| +| `--pin-weight [PIN_WEIGHT]` | FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%). | + +#### DeepSpeed + +| Flag | Description | +|---------------------------------------|-------------| +| `--deepspeed` | Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. | +| `--nvme-offload-dir NVME_OFFLOAD_DIR` | DeepSpeed: Directory to use for ZeRO-3 NVME offloading. | +| `--local_rank LOCAL_RANK` | DeepSpeed: Optional argument for distributed setups. | + +#### RWKV + +| Flag | Description | +|---------------------------------|-------------| +| `--rwkv-strategy RWKV_STRATEGY` | RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8". | +| `--rwkv-cuda-on` | RWKV: Compile the CUDA kernel for better performance. | + +#### Gradio + +| Flag | Description | +|---------------------------------------|-------------| +| `--listen` | Make the web UI reachable from your local network. | +| `--listen-port LISTEN_PORT` | The listening port that the server will use. | +| `--share` | Create a public URL. This is useful for running the web UI on Google Colab or similar. | +| `--auto-launch` | Open the web UI in the default browser upon launch. | +| `--gradio-auth-path GRADIO_AUTH_PATH` | Set the gradio authentication file path. The file should contain one or more user:password pairs in this format: "u1:p1,u2:p2,u3:p3" | + +Out of memory errors? 
[Check the low VRAM guide](https://github.com/oobabooga/text-generation-webui/wiki/Low-VRAM-guide). + +## Presets + +Inference settings presets can be created under `presets/` as text files. These files are detected automatically at startup. + +By default, 10 presets by NovelAI and KoboldAI are included. These were selected out of a sample of 43 presets after applying a K-Means clustering algorithm and selecting the elements closest to the average of each cluster. + +[Visualization](https://user-images.githubusercontent.com/112222186/228956352-1addbdb9-2456-465a-b51d-089f462cd385.png) + +## System requirements + +Check the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/System-requirements) for some examples of VRAM and RAM usage in both GPU and CPU mode. + +## Contributing + +Pull requests, suggestions, and issue reports are welcome. + +Before reporting a bug, make sure that you have: + +1. Created a conda environment and installed the dependencies exactly as in the *Installation* section above. +2. [Searched](https://github.com/oobabooga/text-generation-webui/issues) to see if an issue already exists for the issue you encountered. + +## Credits + +- Gradio dropdown menu refresh button, code for reloading the interface: https://github.com/AUTOMATIC1111/stable-diffusion-webui +- Verbose preset: Anonymous 4chan user. 
+- NovelAI and KoboldAI presets: https://github.com/KoboldAI/KoboldAI-Client/wiki/Settings-Presets +- Code for early stopping in chat mode, code for some of the sliders: https://github.com/PygmalionAI/gradio-ui/ diff --git a/text-generation-webui/Safe-LLaMA-HF (3-26-23).torrent b/text-generation-webui/Safe-LLaMA-HF (3-26-23).torrent new file mode 100644 index 0000000000000000000000000000000000000000..3cac501883a7ab154588a0c543fa237b02fec3cc Binary files /dev/null and b/text-generation-webui/Safe-LLaMA-HF (3-26-23).torrent differ diff --git a/text-generation-webui/api-example-stream.py b/text-generation-webui/api-example-stream.py new file mode 100644 index 0000000000000000000000000000000000000000..17de4c280d49e5e0c223cf8b97274638501b1397 --- /dev/null +++ b/text-generation-webui/api-example-stream.py @@ -0,0 +1,82 @@ +''' + +Contributed by SagsMug. Thank you SagsMug. +https://github.com/oobabooga/text-generation-webui/pull/175 + +''' + +import asyncio +import json +import random +import string + +import websockets + + +def random_hash(): + letters = string.ascii_lowercase + string.digits + return ''.join(random.choice(letters) for i in range(9)) + + +async def run(context): + server = "127.0.0.1" + params = { + 'max_new_tokens': 200, + 'do_sample': True, + 'temperature': 0.5, + 'top_p': 0.9, + 'typical_p': 1, + 'repetition_penalty': 1.05, + 'encoder_repetition_penalty': 1.0, + 'top_k': 0, + 'min_length': 0, + 'no_repeat_ngram_size': 0, + 'num_beams': 1, + 'penalty_alpha': 0, + 'length_penalty': 1, + 'early_stopping': False, + 'seed': -1, + } + payload = json.dumps([context, params]) + session = random_hash() + + async with websockets.connect(f"ws://{server}:7860/queue/join") as websocket: + while content := json.loads(await websocket.recv()): + # match/case requires Python 3.10+; on older versions, replace it with an if/elif chain on content["msg"] + match content["msg"]: + case "send_hash": + await websocket.send(json.dumps({ + "session_hash": session, + "fn_index": 12 + })) + case "estimation": + pass + case
"send_data": + await websocket.send(json.dumps({ + "session_hash": session, + "fn_index": 12, + "data": [ + payload + ] + })) + case "process_starts": + pass + case "process_generating" | "process_completed": + yield content["output"]["data"][0] + # You can search for your desired end indicator and + # stop generation by closing the websocket here + if (content["msg"] == "process_completed"): + break + +prompt = "What I would like to say is the following: " + + +async def get_result(): + async for response in run(prompt): + # Print each intermediate output yielded by run() as it arrives + print(response) + + # Print final result: the last value yielded by run() (assumes at least one value was yielded) + print(response) + +asyncio.run(get_result()) diff --git a/text-generation-webui/api-example.py b/text-generation-webui/api-example.py new file mode 100644 index 0000000000000000000000000000000000000000..10be0a88062993784c45b28cf580353e9e8e7b05 --- /dev/null +++ b/text-generation-webui/api-example.py @@ -0,0 +1,52 @@ +''' + +This is an example of how to use the API for oobabooga/text-generation-webui. + +Make sure to start the web UI with the following flags: + +python server.py --model MODEL --listen --no-stream + +Optionally, you can also add the --share flag to generate a public gradio URL, +allowing you to use the API remotely.
+ +''' +import json + +import requests + +# Server address +server = "127.0.0.1" + +# Generation parameters +# Reference: https://huggingface.co./docs/transformers/main_classes/text_generation#transformers.GenerationConfig +params = { + 'max_new_tokens': 200, + 'do_sample': True, + 'temperature': 0.5, + 'top_p': 0.9, + 'typical_p': 1, + 'repetition_penalty': 1.05, + 'encoder_repetition_penalty': 1.0, + 'top_k': 0, + 'min_length': 0, + 'no_repeat_ngram_size': 0, + 'num_beams': 1, + 'penalty_alpha': 0, + 'length_penalty': 1, + 'early_stopping': False, + 'seed': -1, +} + +# Input prompt +prompt = "What I would like to say is the following: " + +payload = json.dumps([prompt, params]) + +response = requests.post(f"http://{server}:7860/run/textgen", json={ + "data": [ + payload + ] +}).json() + +reply = response["data"][0] +print(reply) diff --git a/text-generation-webui/cache/Example.png_cache.png b/text-generation-webui/cache/Example.png_cache.png new file mode 100644 index 0000000000000000000000000000000000000000..699b9bec18bed33a900bf9771df306beccd3a99d Binary files /dev/null and b/text-generation-webui/cache/Example.png_cache.png differ diff --git a/text-generation-webui/characters/Example.png b/text-generation-webui/characters/Example.png new file mode 100644 index 0000000000000000000000000000000000000000..a7c4e513c4eaa05db1ebb2164956ea0b85d74a75 Binary files /dev/null and b/text-generation-webui/characters/Example.png differ diff --git a/text-generation-webui/characters/Example.yaml b/text-generation-webui/characters/Example.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0160f45c97d9c4cfccb473d0b2dde0885ad57c97 --- /dev/null +++ b/text-generation-webui/characters/Example.yaml @@ -0,0 +1,16 @@ +name: "Chiharu Yamada" +context: "Chiharu Yamada's Persona: Chiharu Yamada is a young, computer engineer-nerd with a knack for problem solving and a passion for technology."
+greeting: |- + *Chiharu strides into the room with a smile, her eyes lighting up when she sees you. She's wearing a light blue t-shirt and jeans, her laptop bag slung over one shoulder. She takes a seat next to you, her enthusiasm palpable in the air* + Hey! I'm so excited to finally meet you. I've heard so many great things about you and I'm eager to pick your brain about computers. I'm sure you have a wealth of knowledge that I can learn from. *She grins, eyes twinkling with excitement* Let's get started! +example_dialogue: |- + {{user}}: So how did you get into computer engineering? + {{char}}: I've always loved tinkering with technology since I was a kid. + {{user}}: That's really impressive! + {{char}}: *She chuckles bashfully* Thanks! + {{user}}: So what do you do when you're not working on computers? + {{char}}: I love exploring, going out with friends, watching movies, and playing video games. + {{user}}: What's your favorite type of computer hardware to work with? + {{char}}: Motherboards, they're like puzzles and the backbone of any system. + {{user}}: That sounds great! + {{char}}: Yeah, it's really fun. I'm lucky to be able to do this as a job. diff --git a/text-generation-webui/characters/instruction-following/Alpaca.yaml b/text-generation-webui/characters/instruction-following/Alpaca.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3037324244519cd79dfaa0f804e04a4b8d9a295e --- /dev/null +++ b/text-generation-webui/characters/instruction-following/Alpaca.yaml @@ -0,0 +1,3 @@ +name: "### Response:" +your_name: "### Instruction:" +context: "Below is an instruction that describes a task. Write a response that appropriately completes the request." 
diff --git a/text-generation-webui/characters/instruction-following/Open Assistant.yaml b/text-generation-webui/characters/instruction-following/Open Assistant.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5b3320ff38178084f5ba1dc63e558bc8587a6308 --- /dev/null +++ b/text-generation-webui/characters/instruction-following/Open Assistant.yaml @@ -0,0 +1,3 @@ +name: "<|assistant|>" +your_name: "<|prompter|>" +end_of_turn: "<|endoftext|>" diff --git a/text-generation-webui/characters/instruction-following/Vicuna.yaml b/text-generation-webui/characters/instruction-following/Vicuna.yaml new file mode 100644 index 0000000000000000000000000000000000000000..026901d4d98a7a140eebb96a4b76b401155b8d9a --- /dev/null +++ b/text-generation-webui/characters/instruction-following/Vicuna.yaml @@ -0,0 +1,3 @@ +name: "### Assistant:" +your_name: "### Human:" +context: "Below is an instruction that describes a task. Write a response that appropriately completes the request." diff --git a/text-generation-webui/convert-to-flexgen.py b/text-generation-webui/convert-to-flexgen.py new file mode 100644 index 0000000000000000000000000000000000000000..7654593b539541deebfe904403ce73daa4a8651c --- /dev/null +++ b/text-generation-webui/convert-to-flexgen.py @@ -0,0 +1,63 @@ +''' + +Converts a transformers model to a format compatible with flexgen. + +''' + +import argparse +import os +from pathlib import Path + +import numpy as np +import torch +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer + +parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=54)) +parser.add_argument('MODEL', type=str, default=None, nargs='?', help="Path to the input model.") +args = parser.parse_args() + + +def disable_torch_init(): + """ + Disable the redundant torch default initialization to accelerate model creation. 
+ """ + import torch + global torch_linear_init_backup + global torch_layer_norm_init_backup + + torch_linear_init_backup = torch.nn.Linear.reset_parameters + setattr(torch.nn.Linear, "reset_parameters", lambda self: None) + + torch_layer_norm_init_backup = torch.nn.LayerNorm.reset_parameters + setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None) + + +def restore_torch_init(): + """Rollback the change made by disable_torch_init.""" + import torch + setattr(torch.nn.Linear, "reset_parameters", torch_linear_init_backup) + setattr(torch.nn.LayerNorm, "reset_parameters", torch_layer_norm_init_backup) + + +if __name__ == '__main__': + path = Path(args.MODEL) + model_name = path.name + + print(f"Loading {model_name}...") + # disable_torch_init() + model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + # restore_torch_init() + + tokenizer = AutoTokenizer.from_pretrained(path) + + out_folder = Path(f"models/{model_name}-np") + if not Path(out_folder).exists(): + os.mkdir(out_folder) + + print(f"Saving the converted model to {out_folder}...") + for name, param in tqdm(list(model.model.named_parameters())): + name = name.replace("decoder.final_layer_norm", "decoder.layer_norm") + param_path = os.path.join(out_folder, name) + with open(param_path, "wb") as f: + np.save(f, param.cpu().detach().numpy()) diff --git a/text-generation-webui/convert-to-safetensors.py b/text-generation-webui/convert-to-safetensors.py new file mode 100644 index 0000000000000000000000000000000000000000..3b721e7cd4d15cf7e5e03caaee57ef83a41553bc --- /dev/null +++ b/text-generation-webui/convert-to-safetensors.py @@ -0,0 +1,38 @@ +''' + +Converts a transformers model to safetensors format and shards it. + +This makes it faster to load (because of safetensors) and lowers its RAM usage +while loading (because of sharding). 
+ +Based on the original script by 81300: + +https://gist.github.com/81300/fe5b08bff1cba45296a829b9d6b0f303 + +''' + +import argparse +from pathlib import Path + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +parser = argparse.ArgumentParser(formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=54)) +parser.add_argument('MODEL', type=str, default=None, nargs='?', help="Path to the input model.") +parser.add_argument('--output', type=str, default=None, help='Path to the output folder (default: models/{model_name}_safetensors).') +parser.add_argument("--max-shard-size", type=str, default="2GB", help="Maximum size of a shard in GB or MB (default: %(default)s).") +parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.') +args = parser.parse_args() + +if __name__ == '__main__': + path = Path(args.MODEL) + model_name = path.name + + print(f"Loading {model_name}...") + model = AutoModelForCausalLM.from_pretrained(path, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if args.bf16 else torch.float16) + tokenizer = AutoTokenizer.from_pretrained(path) + + out_folder = args.output or Path(f"models/{model_name}_safetensors") + print(f"Saving the converted model to {out_folder} with a maximum shard size of {args.max_shard_size}...") + model.save_pretrained(out_folder, max_shard_size=args.max_shard_size, safe_serialization=True) + tokenizer.save_pretrained(out_folder) diff --git a/text-generation-webui/css/chat.css b/text-generation-webui/css/chat.css new file mode 100644 index 0000000000000000000000000000000000000000..c8a9d70a85867967c12616186994eb3fd35a6b68 --- /dev/null +++ b/text-generation-webui/css/chat.css @@ -0,0 +1,38 @@ +.h-\[40vh\], .wrap.svelte-byatnx.svelte-byatnx.svelte-byatnx { + height: 66.67vh +} + +.gradio-container { + margin-left: auto !important; + margin-right: auto !important; +} + +.w-screen { + width: unset +} + 
+div.svelte-362y77>*, div.svelte-362y77>.form>* { + flex-wrap: nowrap +} + +/* fixes the API documentation in chat mode */ +.api-docs.svelte-1iguv9h.svelte-1iguv9h.svelte-1iguv9h { + display: grid; +} + +.pending.svelte-1ed2p3z { + opacity: 1; +} + +#extensions { + padding: 0; + padding: 0; +} + +#gradio-chatbot { + height: 66.67vh; +} + +.wrap.svelte-6roggh.svelte-6roggh { + max-height: 92.5%; +} diff --git a/text-generation-webui/css/chat.js b/text-generation-webui/css/chat.js new file mode 100644 index 0000000000000000000000000000000000000000..e304f1254732e475bf177ee849ac51d4f3e30f46 --- /dev/null +++ b/text-generation-webui/css/chat.js @@ -0,0 +1,4 @@ +document.getElementById("main").childNodes[0].style = "max-width: 800px; margin-left: auto; margin-right: auto"; +document.getElementById("extensions").style.setProperty("max-width", "800px"); +document.getElementById("extensions").style.setProperty("margin-left", "auto"); +document.getElementById("extensions").style.setProperty("margin-right", "auto"); diff --git a/text-generation-webui/css/html_4chan_style.css b/text-generation-webui/css/html_4chan_style.css new file mode 100644 index 0000000000000000000000000000000000000000..843e8a97fea80b010004f90f02ce63e8d13fe758 --- /dev/null +++ b/text-generation-webui/css/html_4chan_style.css @@ -0,0 +1,103 @@ +#parent #container { + background-color: #eef2ff; + padding: 17px; +} +#parent #container .reply { + background-color: rgb(214, 218, 240); + border-bottom-color: rgb(183, 197, 217); + border-bottom-style: solid; + border-bottom-width: 1px; + border-image-outset: 0; + border-image-repeat: stretch; + border-image-slice: 100%; + border-image-source: none; + border-image-width: 1; + border-left-color: rgb(0, 0, 0); + border-left-style: none; + border-left-width: 0px; + border-right-color: rgb(183, 197, 217); + border-right-style: solid; + border-right-width: 1px; + border-top-color: rgb(0, 0, 0); + border-top-style: none; + border-top-width: 0px; + color: rgb(0, 0, 0); 
+ display: table; + font-family: arial, helvetica, sans-serif; + font-size: 13.3333px; + margin-bottom: 4px; + margin-left: 0px; + margin-right: 0px; + margin-top: 4px; + overflow-x: hidden; + overflow-y: hidden; + padding-bottom: 4px; + padding-left: 2px; + padding-right: 2px; + padding-top: 4px; +} + +#parent #container .number { + color: rgb(0, 0, 0); + font-family: arial, helvetica, sans-serif; + font-size: 13.3333px; + width: 342.65px; + margin-right: 7px; +} + +#parent #container .op { + color: rgb(0, 0, 0); + font-family: arial, helvetica, sans-serif; + font-size: 13.3333px; + margin-bottom: 8px; + margin-left: 0px; + margin-right: 0px; + margin-top: 4px; + overflow-x: hidden; + overflow-y: hidden; +} + +#parent #container .op blockquote { + margin-left: 0px !important; +} + +#parent #container .name { + color: rgb(17, 119, 67); + font-family: arial, helvetica, sans-serif; + font-size: 13.3333px; + font-weight: 700; + margin-left: 7px; +} + +#parent #container .quote { + color: rgb(221, 0, 0); + font-family: arial, helvetica, sans-serif; + font-size: 13.3333px; + text-decoration-color: rgb(221, 0, 0); + text-decoration-line: underline; + text-decoration-style: solid; + text-decoration-thickness: auto; +} + +#parent #container .greentext { + color: rgb(120, 153, 34); + font-family: arial, helvetica, sans-serif; + font-size: 13.3333px; +} + +#parent #container blockquote { + margin: 0px !important; + margin-block-start: 1em; + margin-block-end: 1em; + margin-inline-start: 40px; + margin-inline-end: 40px; + margin-top: 13.33px !important; + margin-bottom: 13.33px !important; + margin-left: 40px !important; + margin-right: 40px !important; +} + +#parent #container .message { + color: black; + border: none; +} \ No newline at end of file diff --git a/text-generation-webui/css/html_cai_style.css b/text-generation-webui/css/html_cai_style.css new file mode 100644 index 0000000000000000000000000000000000000000..57c3b5cd850707ee40ff5bf8465b08bb0f6613db --- /dev/null 
+++ b/text-generation-webui/css/html_cai_style.css @@ -0,0 +1,82 @@ +.chat { + margin-left: auto; + margin-right: auto; + max-width: 800px; + height: 66.67vh; + overflow-y: auto; + padding-right: 20px; + display: flex; + flex-direction: column-reverse; +} + +.message { + display: grid; + grid-template-columns: 60px 1fr; + padding-bottom: 25px; + font-size: 15px; + font-family: Helvetica, Arial, sans-serif; + line-height: 1.428571429; +} + +.circle-you { + width: 50px; + height: 50px; + background-color: rgb(238, 78, 59); + border-radius: 50%; +} + +.circle-bot { + width: 50px; + height: 50px; + background-color: rgb(59, 78, 244); + border-radius: 50%; +} + +.circle-bot img, +.circle-you img { + border-radius: 50%; + width: 100%; + height: 100%; + object-fit: cover; +} + +.text {} + +.text p { + margin-top: 5px; +} + +.username { + font-weight: bold; +} + +.message-body {} + +.message-body img { + max-width: 300px; + max-height: 300px; + border-radius: 20px; +} + +.message-body p { + margin-bottom: 0 !important; + font-size: 15px !important; + line-height: 1.428571429 !important; +} + +.message-body li { + margin-top: 0.5em !important; + margin-bottom: 0.5em !important; +} + +.message-body li > p { + display: inline !important; +} + +.dark .message-body p em { + color: rgb(138, 138, 138) !important; +} + +.message-body p em { + color: rgb(110, 110, 110) !important; +} \ No newline at end of file diff --git a/text-generation-webui/css/html_instruct_style.css b/text-generation-webui/css/html_instruct_style.css new file mode 100644 index 0000000000000000000000000000000000000000..533c547a8ad050b5eb2c233fdd9497c32fd9a5cb --- /dev/null +++ b/text-generation-webui/css/html_instruct_style.css @@ -0,0 +1,65 @@ +.chat { + margin-left: auto; + margin-right: auto; + max-width: 800px; + height: 66.67vh; + overflow-y: auto; + padding-right: 20px; + display: flex; + flex-direction: column-reverse; +} + +.message { + display: grid; + grid-template-columns: 60px 1fr; + 
padding-bottom: 25px; + font-size: 15px; + font-family: Helvetica, Arial, sans-serif; + line-height: 1.428571429; +} + +.username { + display: none; +} + +.message-body {} + +.message-body p { + margin-bottom: 0 !important; + font-size: 15px !important; + line-height: 1.428571429 !important; +} + +.message-body li { + margin-top: 0.5em !important; + margin-bottom: 0.5em !important; +} + +.message-body li > p { + display: inline !important; +} + +.dark .message-body p em { + color: rgb(138, 138, 138) !important; +} + +.message-body p em { + color: rgb(110, 110, 110) !important; +} + +.gradio-container .chat .assistant-message { + padding: 15px; + border-radius: 20px; + background-color: #0000000f; + margin-bottom: 17.5px; +} + +.gradio-container .chat .user-message { + padding: 15px; + border-radius: 20px; + margin-bottom: 17.5px !important; +} + +.dark .chat .assistant-message { + background-color: #ffffff21; +} \ No newline at end of file diff --git a/text-generation-webui/css/html_readable_style.css b/text-generation-webui/css/html_readable_style.css new file mode 100644 index 0000000000000000000000000000000000000000..d3f580a53af84f1e53622d71a5ce1075d97fb974 --- /dev/null +++ b/text-generation-webui/css/html_readable_style.css @@ -0,0 +1,14 @@ +.container { + max-width: 600px; + margin-left: auto; + margin-right: auto; + background-color: rgb(31, 41, 55); + padding:3em; +} + +.container p { + font-size: 16px !important; + color: white !important; + margin-bottom: 22px; + line-height: 1.4 !important; +} diff --git a/text-generation-webui/css/main.css b/text-generation-webui/css/main.css new file mode 100644 index 0000000000000000000000000000000000000000..2d8f01eac10ad241b9a21ed5126952b7b49b1170 --- /dev/null +++ b/text-generation-webui/css/main.css @@ -0,0 +1,69 @@ +.tabs.svelte-710i53 { + margin-top: 0 +} + +.py-6 { + padding-top: 2.5rem +} + +.dark #refresh-button { + background-color: #ffffff1f; +} + +#refresh-button { + flex: none; + margin: 0; + padding: 0; + 
min-width: 50px; + border: none; + box-shadow: none; + border-radius: 10px; + background-color: #0000000d; +} + +#download-label, #upload-label { + min-height: 0 +} + +#accordion { +} + +.dark svg { + fill: white; +} + +.dark a { + color: white !important; + text-decoration: none !important; +} + +ol li p, ul li p { + display: inline-block; +} + +#main, #parameters, #chat-settings, #interface-mode, #lora, #training-tab, #model-tab { + border: 0; +} + +.gradio-container-3-18-0 .prose * h1, h2, h3, h4 { + color: white; +} + +.gradio-container { + max-width: 100% !important; + padding-top: 0 !important; +} + +#extensions { + padding: 15px; + padding: 15px; +} + +span.math.inline { + font-size: 27px; + vertical-align: baseline !important; +} + +div.svelte-15lo0d8 > *, div.svelte-15lo0d8 > .form > * { + flex-wrap: nowrap; +} diff --git a/text-generation-webui/css/main.js b/text-generation-webui/css/main.js new file mode 100644 index 0000000000000000000000000000000000000000..029ecb6214b813ecdb7165725512a11a0c41b515 --- /dev/null +++ b/text-generation-webui/css/main.js @@ -0,0 +1,18 @@ +document.getElementById("main").parentNode.childNodes[0].style = "border: none; background-color: #8080802b; margin-bottom: 40px"; +document.getElementById("main").parentNode.style = "padding: 0; margin: 0"; +document.getElementById("main").parentNode.parentNode.parentNode.style = "padding: 0"; + +// Get references to the elements +let main = document.getElementById('main'); +let main_parent = main.parentNode; +let extensions = document.getElementById('extensions'); + +// Add an event listener to the main element +main_parent.addEventListener('click', function(e) { + // Check if the main element is visible + if (main.offsetHeight > 0 && main.offsetWidth > 0) { + extensions.style.display = 'flex'; + } else { + extensions.style.display = 'none'; + } +}); diff --git a/text-generation-webui/docker-compose.yml b/text-generation-webui/docker-compose.yml new file mode 100644 index 
0000000000000000000000000000000000000000..509caee22e071c6e8dae0ca7f02387a98f3924e3 --- /dev/null +++ b/text-generation-webui/docker-compose.yml @@ -0,0 +1,32 @@ +version: "3.3" +services: + text-generation-webui: + build: + context: . + args: + # specify which cuda version your card supports: https://developer.nvidia.com/cuda-gpus + TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST} + GPTQ_VERSION: ${GPTQ_VERSION} + WEBUI_VERSION: ${WEBUI_VERSION} + env_file: .env + ports: + - "${HOST_PORT}:${CONTAINER_PORT}" + - "${HOST_API_PORT}:${CONTAINER_API_PORT}" + stdin_open: true + tty: true + volumes: + - ./characters:/app/characters + - ./extensions:/app/extensions + - ./loras:/app/loras + - ./models:/app/models + - ./presets:/app/presets + - ./prompts:/app/prompts + - ./softprompts:/app/softprompts + - ./training:/app/training + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['0'] + capabilities: [gpu] diff --git a/text-generation-webui/download-model.py b/text-generation-webui/download-model.py new file mode 100644 index 0000000000000000000000000000000000000000..38e5f4525af4e44a26cdabd8b93bfe1f8e8fb6d9 --- /dev/null +++ b/text-generation-webui/download-model.py @@ -0,0 +1,256 @@ +''' +Downloads models from Hugging Face to models/model-name. 
+ +Example: +python download-model.py facebook/opt-1.3b + +''' + +import argparse +import base64 +import datetime +import hashlib +import json +import re +import sys +from pathlib import Path + +import requests +import tqdm +from tqdm.contrib.concurrent import thread_map + +parser = argparse.ArgumentParser() +parser.add_argument('MODEL', type=str, default=None, nargs='?') +parser.add_argument('--branch', type=str, default='main', help='Name of the Git branch to download from.') +parser.add_argument('--threads', type=int, default=1, help='Number of files to download simultaneously.') +parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).') +parser.add_argument('--output', type=str, default=None, help='The folder where the model should be saved.') +parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.') +parser.add_argument('--check', action='store_true', help='Validates the checksums of model files.') +args = parser.parse_args() + + +def get_file(url, output_folder): + filename = Path(url.rsplit('/', 1)[1]) + output_path = output_folder / filename + if output_path.exists() and not args.clean: + # Check if the file has already been downloaded completely + r = requests.get(url, stream=True) + total_size = int(r.headers.get('content-length', 0)) + if output_path.stat().st_size >= total_size: + return + # Otherwise, resume the download from where it left off + headers = {'Range': f'bytes={output_path.stat().st_size}-'} + mode = 'ab' + else: + headers = {} + mode = 'wb' + + r = requests.get(url, stream=True, headers=headers) + with open(output_path, mode) as f: + total_size = int(r.headers.get('content-length', 0)) + block_size = 1024 + with tqdm.tqdm(total=total_size, unit='iB', unit_scale=True, bar_format='{l_bar}{bar}| {n_fmt:6}/{total_fmt:6} {rate_fmt:6}') as t: + for data in r.iter_content(block_size): + t.update(len(data)) + f.write(data) + + +def 
sanitize_branch_name(branch_name): + pattern = re.compile(r"^[a-zA-Z0-9._-]+$") + if pattern.match(branch_name): + return branch_name + else: + raise ValueError("Invalid branch name. Only alphanumeric characters, period, underscore and dash are allowed.") + + +def select_model_from_default_options(): + models = { + "OPT 6.7B": ("facebook", "opt-6.7b", "main"), + "OPT 2.7B": ("facebook", "opt-2.7b", "main"), + "OPT 1.3B": ("facebook", "opt-1.3b", "main"), + "OPT 350M": ("facebook", "opt-350m", "main"), + "GALACTICA 6.7B": ("facebook", "galactica-6.7b", "main"), + "GALACTICA 1.3B": ("facebook", "galactica-1.3b", "main"), + "GALACTICA 125M": ("facebook", "galactica-125m", "main"), + "Pythia-6.9B-deduped": ("EleutherAI", "pythia-6.9b-deduped", "main"), + "Pythia-2.8B-deduped": ("EleutherAI", "pythia-2.8b-deduped", "main"), + "Pythia-1.4B-deduped": ("EleutherAI", "pythia-1.4b-deduped", "main"), + "Pythia-410M-deduped": ("EleutherAI", "pythia-410m-deduped", "main"), + } + choices = {} + + print("Select the model that you want to download:\n") + for i, name in enumerate(models): + char = chr(ord('A') + i) + choices[char] = name + print(f"{char}) {name}") + char = chr(ord('A') + len(models)) + print(f"{char}) None of the above") + + print() + print("Input> ", end='') + choice = input()[0].strip().upper() + if choice == char: + print("""\nThen type the name of your desired Hugging Face model in the format organization/name. + +Examples: +facebook/opt-1.3b +EleutherAI/pythia-1.4b-deduped +""") + + print("Input> ", end='') + model = input() + branch = "main" + else: + arr = models[choices[choice]] + model = f"{arr[0]}/{arr[1]}" + branch = arr[2] + + return model, branch + + +def get_download_links_from_huggingface(model, branch): + base = "https://huggingface.co." 
+ page = f"/api/models/{model}/tree/{branch}?cursor=" + cursor = b"" + + links = [] + sha256 = [] + classifications = [] + has_pytorch = False + has_pt = False + has_ggml = False + has_safetensors = False + is_lora = False + while True: + content = requests.get(f"{base}{page}{cursor.decode()}").content + + dict = json.loads(content) + if len(dict) == 0: + break + + for i in range(len(dict)): + fname = dict[i]['path'] + if not is_lora and fname.endswith(('adapter_config.json', 'adapter_model.bin')): + is_lora = True + + is_pytorch = re.match("(pytorch|adapter)_model.*\.bin", fname) + is_safetensors = re.match(".*\.safetensors", fname) + is_pt = re.match(".*\.pt", fname) + is_ggml = re.match("ggml.*\.bin", fname) + is_tokenizer = re.match("tokenizer.*\.model", fname) + is_text = re.match(".*\.(txt|json|py|md)", fname) or is_tokenizer + + if any((is_pytorch, is_safetensors, is_pt, is_tokenizer, is_text)): + if 'lfs' in dict[i]: + sha256.append([fname, dict[i]['lfs']['oid']]) + if is_text: + links.append(f"https://huggingface.co./{model}/resolve/{branch}/{fname}") + classifications.append('text') + continue + if not args.text_only: + links.append(f"https://huggingface.co./{model}/resolve/{branch}/{fname}") + if is_safetensors: + has_safetensors = True + classifications.append('safetensors') + elif is_pytorch: + has_pytorch = True + classifications.append('pytorch') + elif is_pt: + has_pt = True + classifications.append('pt') + elif is_ggml: + has_ggml = True + classifications.append('ggml') + + cursor = base64.b64encode(f'{{"file_name":"{dict[-1]["path"]}"}}'.encode()) + b':50' + cursor = base64.b64encode(cursor) + cursor = cursor.replace(b'=', b'%3D') + + # If both pytorch and safetensors are available, download safetensors only + if (has_pytorch or has_pt) and has_safetensors: + for i in range(len(classifications) - 1, -1, -1): + if classifications[i] in ['pytorch', 'pt']: + links.pop(i) + + return links, sha256, is_lora + + +def download_files(file_list, 
output_folder, num_threads=8): + thread_map(lambda url: get_file(url, output_folder), file_list, max_workers=num_threads, disable=True) + + +if __name__ == '__main__': + model = args.MODEL + branch = args.branch + if model is None: + model, branch = select_model_from_default_options() + else: + if model[-1] == '/': + model = model[:-1] + branch = args.branch + if branch is None: + branch = "main" + else: + try: + branch = sanitize_branch_name(branch) + except ValueError as err_branch: + print(f"Error: {err_branch}") + sys.exit() + + links, sha256, is_lora = get_download_links_from_huggingface(model, branch) + + if args.output is not None: + base_folder = args.output + else: + base_folder = 'models' if not is_lora else 'loras' + + output_folder = f"{'_'.join(model.split('/')[-2:])}" + if branch != 'main': + output_folder += f'_{branch}' + output_folder = Path(base_folder) / output_folder + + if args.check: + # Validate the checksums + validated = True + for i in range(len(sha256)): + fpath = (output_folder / sha256[i][0]) + + if not fpath.exists(): + print(f"The following file is missing: {fpath}") + validated = False + continue + + with open(output_folder / sha256[i][0], "rb") as f: + bytes = f.read() + file_hash = hashlib.sha256(bytes).hexdigest() + if file_hash != sha256[i][1]: + print(f'Checksum failed: {sha256[i][0]} {sha256[i][1]}') + validated = False + else: + print(f'Checksum validated: {sha256[i][0]} {sha256[i][1]}') + + if validated: + print('[+] Validated checksums of all model files!') + else: + print('[-] Invalid checksums. 
Rerun download-model.py with the --clean flag.') + + else: + + # Creating the folder and writing the metadata + if not output_folder.exists(): + output_folder.mkdir() + with open(output_folder / 'huggingface-metadata.txt', 'w') as f: + f.write(f'url: https://huggingface.co./{model}\n') + f.write(f'branch: {branch}\n') + f.write(f'download date: {str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))}\n') + sha256_str = '' + for i in range(len(sha256)): + sha256_str += f' {sha256[i][1]} {sha256[i][0]}\n' + if sha256_str != '': + f.write(f'sha256sum:\n{sha256_str}') + + # Downloading the files + print(f"Downloading the model to {output_folder}") + download_files(links, output_folder, args.threads) diff --git a/text-generation-webui/extensions/api/requirements.txt b/text-generation-webui/extensions/api/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad788ab8ad3a3c07283771780610e9e757fad710 --- /dev/null +++ b/text-generation-webui/extensions/api/requirements.txt @@ -0,0 +1 @@ +flask_cloudflared==0.0.12 \ No newline at end of file diff --git a/text-generation-webui/extensions/api/script.py b/text-generation-webui/extensions/api/script.py new file mode 100644 index 0000000000000000000000000000000000000000..4981725fd887b0a05dd920423169f595101ac26c --- /dev/null +++ b/text-generation-webui/extensions/api/script.py @@ -0,0 +1,101 @@ +import json +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from threading import Thread + +from modules import shared +from modules.text_generation import encode, generate_reply + +params = { + 'port': 5000, +} + + +class Handler(BaseHTTPRequestHandler): + def do_GET(self): + if self.path == '/api/v1/model': + self.send_response(200) + self.end_headers() + response = json.dumps({ + 'result': shared.model_name + }) + + self.wfile.write(response.encode('utf-8')) + else: + self.send_error(404) + + def do_POST(self): + content_length = int(self.headers['Content-Length']) + body = 
json.loads(self.rfile.read(content_length).decode('utf-8')) + + if self.path == '/api/v1/generate': + self.send_response(200) + self.send_header('Content-Type', 'application/json') + self.end_headers() + + prompt = body['prompt'] + prompt_lines = [k.strip() for k in prompt.split('\n')] + + max_context = body.get('max_context_length', 2048) + + while len(prompt_lines) >= 0 and len(encode('\n'.join(prompt_lines))) > max_context: + prompt_lines.pop(0) + + prompt = '\n'.join(prompt_lines) + generate_params = { + 'max_new_tokens': int(body.get('max_length', 200)), + 'do_sample': bool(body.get('do_sample', True)), + 'temperature': float(body.get('temperature', 0.5)), + 'top_p': float(body.get('top_p', 1)), + 'typical_p': float(body.get('typical', 1)), + 'repetition_penalty': float(body.get('rep_pen', 1.1)), + 'encoder_repetition_penalty': 1, + 'top_k': int(body.get('top_k', 0)), + 'min_length': int(body.get('min_length', 0)), + 'no_repeat_ngram_size': int(body.get('no_repeat_ngram_size', 0)), + 'num_beams': int(body.get('num_beams', 1)), + 'penalty_alpha': float(body.get('penalty_alpha', 0)), + 'length_penalty': float(body.get('length_penalty', 1)), + 'early_stopping': bool(body.get('early_stopping', False)), + 'seed': int(body.get('seed', -1)), + } + + generator = generate_reply( + prompt, + generate_params, + stopping_strings=body.get('stopping_strings', []), + ) + + answer = '' + for a in generator: + if isinstance(a, str): + answer = a + else: + answer = a[0] + + response = json.dumps({ + 'results': [{ + 'text': answer[len(prompt):] + }] + }) + self.wfile.write(response.encode('utf-8')) + else: + self.send_error(404) + + +def run_server(): + server_addr = ('0.0.0.0' if shared.args.listen else '127.0.0.1', params['port']) + server = ThreadingHTTPServer(server_addr, Handler) + if shared.args.share: + try: + from flask_cloudflared import _run_cloudflared + public_url = _run_cloudflared(params['port'], params['port'] + 1) + print(f'Starting KoboldAI compatible api at 
{public_url}/api') + except ImportError: + print('You should install flask_cloudflared manually') + else: + print(f'Starting KoboldAI compatible api at http://{server_addr[0]}:{server_addr[1]}/api') + server.serve_forever() + + +def setup(): + Thread(target=run_server, daemon=True).start() diff --git a/text-generation-webui/extensions/character_bias/script.py b/text-generation-webui/extensions/character_bias/script.py new file mode 100644 index 0000000000000000000000000000000000000000..a92d0aef1d6ade2abb0331c2f785149da32e0161 --- /dev/null +++ b/text-generation-webui/extensions/character_bias/script.py @@ -0,0 +1,46 @@ +import gradio as gr + +params = { + "activate": True, + "bias string": " *I am so happy*", +} + + +def input_modifier(string): + """ + This function is applied to your text inputs before + they are fed into the model. + """ + + return string + + +def output_modifier(string): + """ + This function is applied to the model outputs. + """ + + return string + + +def bot_prefix_modifier(string): + """ + This function is only applied in chat mode. It modifies + the prefix text for the Bot and can be used to bias its + behavior. 
+ """ + + if params['activate']: + return f'{string} {params["bias string"].strip()} ' + else: + return string + + +def ui(): + # Gradio elements + activate = gr.Checkbox(value=params['activate'], label='Activate character bias') + string = gr.Textbox(value=params["bias string"], label='Character bias') + + # Event functions to update the parameters in the backend + string.change(lambda x: params.update({"bias string": x}), string, None) + activate.change(lambda x: params.update({"activate": x}), activate, None) diff --git a/text-generation-webui/extensions/elevenlabs_tts/outputs/outputs-will-be-saved-here.txt b/text-generation-webui/extensions/elevenlabs_tts/outputs/outputs-will-be-saved-here.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/text-generation-webui/extensions/elevenlabs_tts/requirements.txt b/text-generation-webui/extensions/elevenlabs_tts/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ec07a8a7fcf02ca48cc00520e66fcb58c447393 --- /dev/null +++ b/text-generation-webui/extensions/elevenlabs_tts/requirements.txt @@ -0,0 +1,3 @@ +elevenlabslib +soundfile +sounddevice diff --git a/text-generation-webui/extensions/elevenlabs_tts/script.py b/text-generation-webui/extensions/elevenlabs_tts/script.py new file mode 100644 index 0000000000000000000000000000000000000000..5c727a30792d427639e8b7e5783996c9e5bf8692 --- /dev/null +++ b/text-generation-webui/extensions/elevenlabs_tts/script.py @@ -0,0 +1,122 @@ +import re +from pathlib import Path + +import gradio as gr +from elevenlabslib import ElevenLabsUser +from elevenlabslib.helpers import save_bytes_to_path + +import modules.shared as shared + +params = { + 'activate': True, + 'api_key': '12345', + 'selected_voice': 'None', +} + +initial_voice = ['None'] +wav_idx = 0 +user = ElevenLabsUser(params['api_key']) +user_info = None + +if not shared.args.no_stream: + print("Please add --no-stream. 
def remove_surrounded_chars(string):
    """
    Remove asterisk-delimited spans (for example ``*waves*``) from ``string``.

    The pattern matches as few characters as possible between two asterisks,
    OR between an asterisk and the end of the string, so an unterminated
    ``*...`` tail is removed as well. Returns the remaining text.
    """
    # Raw string: '\*' is not a valid str escape, so the non-raw spelling emits
    # an invalid-escape warning on current Python. The regex itself is unchanged
    # ('[^\*]' and '[^*]' describe the same character class).
    return re.sub(r'\*[^*]*?(\*|$)', '', string)
+ """ + + global params, wav_idx, user, user_info + + if not params['activate']: + return string + elif user_info is None: + return string + + string = remove_surrounded_chars(string) + string = string.replace('"', '') + string = string.replace('“', '') + string = string.replace('\n', ' ') + string = string.strip() + + if string == '': + string = 'empty reply, try regenerating' + + output_file = Path(f'extensions/elevenlabs_tts/outputs/{wav_idx:06d}.wav'.format(wav_idx)) + voice = user.get_voices_by_name(params['selected_voice'])[0] + audio_data = voice.generate_audio_bytes(string) + save_bytes_to_path(Path(f'extensions/elevenlabs_tts/outputs/{wav_idx:06d}.wav'), audio_data) + + string = f'' + wav_idx += 1 + return string + + +def ui(): + + # Gradio elements + with gr.Row(): + activate = gr.Checkbox(value=params['activate'], label='Activate TTS') + connection_status = gr.Textbox(value='Disconnected', label='Connection Status') + voice = gr.Dropdown(value=params['selected_voice'], choices=initial_voice, label='TTS Voice') + with gr.Row(): + api_key = gr.Textbox(placeholder="Enter your API key.", label='API Key') + connect = gr.Button(value='Connect') + + # Event functions to update the parameters in the backend + activate.change(lambda x: params.update({'activate': x}), activate, None) + voice.change(lambda x: params.update({'selected_voice': x}), voice, None) + api_key.change(lambda x: params.update({'api_key': x}), api_key, None) + connect.click(check_valid_api, [], connection_status) + connect.click(refresh_voices, [], voice) diff --git a/text-generation-webui/extensions/gallery/__pycache__/script.cpython-310.pyc b/text-generation-webui/extensions/gallery/__pycache__/script.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b93722dc8eeed87fbc1ddc78f4b7978fa6a8a00 Binary files /dev/null and b/text-generation-webui/extensions/gallery/__pycache__/script.cpython-310.pyc differ diff --git 
a/text-generation-webui/extensions/gallery/script.py b/text-generation-webui/extensions/gallery/script.py new file mode 100644 index 0000000000000000000000000000000000000000..993ef273839e7cfbf9e80f2d7f9d4a71d208b446 --- /dev/null +++ b/text-generation-webui/extensions/gallery/script.py @@ -0,0 +1,96 @@ +from pathlib import Path + +import gradio as gr + +from modules.html_generator import get_image_cache +from modules.shared import gradio + + +def generate_css(): + css = """ + .character-gallery > .gallery { + margin: 1rem 0; + display: grid !important; + grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); + grid-column-gap: 0.4rem; + grid-row-gap: 1.2rem; + } + + .character-gallery > .label { + display: none !important; + } + + .character-gallery button.gallery-item { + display: contents; + } + + .character-container { + cursor: pointer; + text-align: center; + position: relative; + opacity: 0.85; + } + + .character-container:hover { + opacity: 1; + } + + .character-container .placeholder, .character-container img { + width: 150px; + height: 200px; + background-color: gray; + object-fit: cover; + margin: 0 auto; + border-radius: 1rem; + border: 3px solid white; + box-shadow: 3px 3px 6px 0px rgb(0 0 0 / 50%); + } + + .character-name { + margin-top: 0.3rem; + display: block; + font-size: 1.2rem; + font-weight: 600; + overflow-wrap: anywhere; + } + """ + return css + + +def generate_html(): + cards = [] + # Iterate through files in image folder + for file in sorted(Path("characters").glob("*")): + if file.suffix in [".json", ".yml", ".yaml"]: + character = file.stem + container_html = '
' + image_html = "
" + + for path in [Path(f"characters/{character}.{extension}") for extension in ['png', 'jpg', 'jpeg']]: + if path.exists(): + image_html = f'' + break + + container_html += f'{image_html} {character}' + container_html += "
" + cards.append([container_html, character]) + + return cards + + +def select_character(evt: gr.SelectData): + return (evt.value[1]) + + +def ui(): + with gr.Accordion("Character gallery", open=False): + update = gr.Button("Refresh") + gr.HTML(value="") + gallery = gr.Dataset(components=[gr.HTML(visible=False)], + label="", + samples=generate_html(), + elem_classes=["character-gallery"], + samples_per_page=50 + ) + update.click(generate_html, [], gallery) + gallery.select(select_character, None, gradio['character_menu']) diff --git a/text-generation-webui/extensions/google_translate/requirements.txt b/text-generation-webui/extensions/google_translate/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..554a00df62818f96ba7d396ae39d8e58efbe9bfe --- /dev/null +++ b/text-generation-webui/extensions/google_translate/requirements.txt @@ -0,0 +1 @@ +deep-translator==1.9.2 diff --git a/text-generation-webui/extensions/google_translate/script.py b/text-generation-webui/extensions/google_translate/script.py new file mode 100644 index 0000000000000000000000000000000000000000..63226107b2c2afe086fc343c7b7f7df78bef3f8a --- /dev/null +++ b/text-generation-webui/extensions/google_translate/script.py @@ -0,0 +1,46 @@ +import gradio as gr +from deep_translator import GoogleTranslator + +params = { + "language string": "ja", +} + +language_codes = {'Afrikaans': 'af', 'Albanian': 'sq', 'Amharic': 'am', 'Arabic': 'ar', 'Armenian': 'hy', 'Azerbaijani': 'az', 'Basque': 'eu', 'Belarusian': 'be', 'Bengali': 'bn', 'Bosnian': 'bs', 'Bulgarian': 'bg', 'Catalan': 'ca', 'Cebuano': 'ceb', 'Chinese (Simplified)': 'zh-CN', 'Chinese (Traditional)': 'zh-TW', 'Corsican': 'co', 'Croatian': 'hr', 'Czech': 'cs', 'Danish': 'da', 'Dutch': 'nl', 'English': 'en', 'Esperanto': 'eo', 'Estonian': 'et', 'Finnish': 'fi', 'French': 'fr', 'Frisian': 'fy', 'Galician': 'gl', 'Georgian': 'ka', 'German': 'de', 'Greek': 'el', 'Gujarati': 'gu', 'Haitian Creole': 'ht', 'Hausa': 'ha', 
'Hawaiian': 'haw', 'Hebrew': 'iw', 'Hindi': 'hi', 'Hmong': 'hmn', 'Hungarian': 'hu', 'Icelandic': 'is', 'Igbo': 'ig', 'Indonesian': 'id', 'Irish': 'ga', 'Italian': 'it', 'Japanese': 'ja', 'Javanese': 'jw', 'Kannada': 'kn', 'Kazakh': 'kk', 'Khmer': 'km', 'Korean': 'ko', 'Kurdish': 'ku', 'Kyrgyz': 'ky', 'Lao': 'lo', 'Latin': 'la', 'Latvian': 'lv', 'Lithuanian': 'lt', 'Luxembourgish': 'lb', 'Macedonian': 'mk', 'Malagasy': 'mg', 'Malay': 'ms', 'Malayalam': 'ml', 'Maltese': 'mt', 'Maori': 'mi', 'Marathi': 'mr', 'Mongolian': 'mn', 'Myanmar (Burmese)': 'my', 'Nepali': 'ne', 'Norwegian': 'no', 'Nyanja (Chichewa)': 'ny', 'Pashto': 'ps', 'Persian': 'fa', 'Polish': 'pl', 'Portuguese (Portugal, Brazil)': 'pt', 'Punjabi': 'pa', 'Romanian': 'ro', 'Russian': 'ru', 'Samoan': 'sm', 'Scots Gaelic': 'gd', 'Serbian': 'sr', 'Sesotho': 'st', 'Shona': 'sn', 'Sindhi': 'sd', 'Sinhala (Sinhalese)': 'si', 'Slovak': 'sk', 'Slovenian': 'sl', 'Somali': 'so', 'Spanish': 'es', 'Sundanese': 'su', 'Swahili': 'sw', 'Swedish': 'sv', 'Tagalog (Filipino)': 'tl', 'Tajik': 'tg', 'Tamil': 'ta', 'Telugu': 'te', 'Thai': 'th', 'Turkish': 'tr', 'Ukrainian': 'uk', 'Urdu': 'ur', 'Uzbek': 'uz', 'Vietnamese': 'vi', 'Welsh': 'cy', 'Xhosa': 'xh', 'Yiddish': 'yi', 'Yoruba': 'yo', 'Zulu': 'zu'} + + +def input_modifier(string): + """ + This function is applied to your text inputs before + they are fed into the model. + """ + + return GoogleTranslator(source=params['language string'], target='en').translate(string) + + +def output_modifier(string): + """ + This function is applied to the model outputs. + """ + + return GoogleTranslator(source='en', target=params['language string']).translate(string) + + +def bot_prefix_modifier(string): + """ + This function is only applied in chat mode. It modifies + the prefix text for the Bot and can be used to bias its + behavior. 
+ """ + + return string + + +def ui(): + # Finding the language name from the language code to use as the default value + language_name = list(language_codes.keys())[list(language_codes.values()).index(params['language string'])] + + # Gradio elements + language = gr.Dropdown(value=language_name, choices=[k for k in language_codes], label='Language') + + # Event functions to update the parameters in the backend + language.change(lambda x: params.update({"language string": language_codes[x]}), language, None) diff --git a/text-generation-webui/extensions/llama_prompts/script.py b/text-generation-webui/extensions/llama_prompts/script.py new file mode 100644 index 0000000000000000000000000000000000000000..3365015659d490c2a77adbdc16eb12217dafee4c --- /dev/null +++ b/text-generation-webui/extensions/llama_prompts/script.py @@ -0,0 +1,21 @@ +import gradio as gr +import pandas as pd + +import modules.shared as shared + +df = pd.read_csv("https://raw.githubusercontent.com/devbrones/llama-prompts/main/prompts/prompts.csv") + + +def get_prompt_by_name(name): + if name == 'None': + return '' + else: + return df[df['Prompt name'] == name].iloc[0]['Prompt'].replace('\\n', '\n') + + +def ui(): + if not shared.is_chat(): + choices = ['None'] + list(df['Prompt name']) + + prompts_menu = gr.Dropdown(value=choices[0], choices=choices, label='Prompt') + prompts_menu.change(get_prompt_by_name, prompts_menu, shared.gradio['textbox']) diff --git a/text-generation-webui/extensions/sd_api_pictures/script.py b/text-generation-webui/extensions/sd_api_pictures/script.py new file mode 100644 index 0000000000000000000000000000000000000000..80a6027a1dcb8acfa5eff78a669fc7a7b61ee50e --- /dev/null +++ b/text-generation-webui/extensions/sd_api_pictures/script.py @@ -0,0 +1,189 @@ +import base64 +import io +import re +from pathlib import Path + +import gradio as gr +import requests +import torch +from PIL import Image + +from modules import chat, shared + +torch._C._jit_set_profiling_mode(False) + 
def remove_surrounded_chars(string):
    """
    Drop every ``*...*`` span from ``string`` and return what is left.

    The expression matches 'as few symbols as possible (0 upwards) between any
    asterisks' OR 'as few symbols as possible (0 upwards) between an asterisk
    and the end of the string'.
    """
    # Written as a raw string so the backslashes reach the regex engine without
    # relying on Python's tolerance for the invalid str escape '\*'.
    return re.sub(r'\*[^*]*?(\*|$)', '', string)
+ """ + global params, picture_response + if not params['enable_SD_api']: + return string + + commands = ['send', 'mail', 'me'] + mediums = ['image', 'pic', 'picture', 'photo'] + subjects = ['yourself', 'own'] + lowstr = string.lower() + + # TODO: refactor out to separate handler and also replace detection with a regexp + if any(command in lowstr for command in commands) and any(case in lowstr for case in mediums): # trigger the generation if a command signature and a medium signature is found + picture_response = True + shared.args.no_stream = True # Disable streaming cause otherwise the SD-generated picture would return as a dud + shared.processing_message = "*Is sending a picture...*" + string = "Please provide a detailed description of your surroundings, how you look and the situation you're in and what you are doing right now" + if any(target in lowstr for target in subjects): # the focus of the image should be on the sending character + string = "Please provide a detailed and vivid description of how you look and what you are wearing" + + return string + +# Get and save the Stable Diffusion-generated picture + + +def get_SD_pictures(description): + + global params, pic_id + + payload = { + "prompt": params['prompt_prefix'] + description, + "seed": -1, + "sampler_name": "DPM++ 2M Karras", + "steps": 32, + "cfg_scale": 7, + "width": params['side_length'], + "height": params['side_length'], + "restore_faces": params['restore_faces'], + "negative_prompt": params['negative_prompt'] + } + + response = requests.post(url=f'{params["address"]}/sdapi/v1/txt2img', json=payload) + r = response.json() + + visible_result = "" + for img_str in r['images']: + image = Image.open(io.BytesIO(base64.b64decode(img_str.split(",", 1)[0]))) + if params['save_img']: + output_file = Path(f'extensions/sd_api_pictures/outputs/{pic_id:06d}.png') + image.save(output_file.as_posix()) + pic_id += 1 + # lower the resolution of received images for the chat, otherwise the log size gets out of 
def output_modifier(string):
    """
    This function is applied to the model outputs.

    When no picture response is pending the text is returned untouched.
    Otherwise the reply is cleaned into a description, handed to
    get_SD_pictures, and the string it returns is placed ahead of the
    description text. The picture flag, the processing message and the
    streaming option are reset before returning.
    """
    global picture_response

    if not picture_response:
        return string

    description = remove_surrounded_chars(string)
    for unwanted, substitute in (('"', ''), ('“', ''), ('\n', ' ')):
        description = description.replace(unwanted, substitute)
    # An empty description is replaced by a fixed hint for the user.
    description = description.strip() or 'no viable description in reply, try regenerating'

    # I can't for the love of all that's holy get the name from shared.gradio['name1'], so for now it will be like this
    caption = f'*Description: "{description}"*'
    picture = get_SD_pictures(description)

    # Reset only after the picture call, as the original did.
    picture_response = False
    shared.processing_message = "*Is typing...*"
    shared.args.no_stream = streaming_state
    return picture + "\n" + caption
+ """ + + return string + + +def force_pic(): + global picture_response + picture_response = True + + +def ui(): + + # Gradio elements + with gr.Accordion("Stable Diffusion api integration", open=True): + with gr.Row(): + with gr.Column(): + enable = gr.Checkbox(value=params['enable_SD_api'], label='Activate SD Api integration') + save_img = gr.Checkbox(value=params['save_img'], label='Keep original received images in the outputs subdir') + with gr.Column(): + address = gr.Textbox(placeholder=params['address'], value=params['address'], label='Stable Diffusion host address') + + with gr.Row(): + force_btn = gr.Button("Force the next response to be a picture") + generate_now_btn = gr.Button("Generate an image response to the input") + + with gr.Accordion("Generation parameters", open=False): + prompt_prefix = gr.Textbox(placeholder=params['prompt_prefix'], value=params['prompt_prefix'], label='Prompt Prefix (best used to describe the look of the character)') + with gr.Row(): + negative_prompt = gr.Textbox(placeholder=params['negative_prompt'], value=params['negative_prompt'], label='Negative Prompt') + dimensions = gr.Slider(256, 702, value=params['side_length'], step=64, label='Image dimensions') + # model = gr.Dropdown(value=SD_models[0], choices=SD_models, label='Model') + + # Event functions to update the parameters in the backend + enable.change(lambda x: params.update({"enable_SD_api": x}), enable, None) + save_img.change(lambda x: params.update({"save_img": x}), save_img, None) + address.change(lambda x: params.update({"address": x}), address, None) + prompt_prefix.change(lambda x: params.update({"prompt_prefix": x}), prompt_prefix, None) + negative_prompt.change(lambda x: params.update({"negative_prompt": x}), negative_prompt, None) + dimensions.change(lambda x: params.update({"side_length": x}), dimensions, None) + # model.change(lambda x: params.update({"SD_model": x}), model, None) + + force_btn.click(force_pic) + generate_now_btn.click(force_pic) + 
generate_now_btn.click(chat.cai_chatbot_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream) diff --git a/text-generation-webui/extensions/send_pictures/script.py b/text-generation-webui/extensions/send_pictures/script.py new file mode 100644 index 0000000000000000000000000000000000000000..678592f5e511fed46f042b90f502fd004ee943e5 --- /dev/null +++ b/text-generation-webui/extensions/send_pictures/script.py @@ -0,0 +1,48 @@ +import base64 +from io import BytesIO + +import gradio as gr +import torch +from transformers import BlipForConditionalGeneration, BlipProcessor + +from modules import chat, shared + +# If 'state' is True, will hijack the next chat generation with +# custom input text given by 'value' in the format [text, visible_text] +input_hijack = { + 'state': False, + 'value': ["", ""] +} + +processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") +model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float32).to("cpu") + + +def caption_image(raw_image): + inputs = processor(raw_image.convert('RGB'), return_tensors="pt").to("cpu", torch.float32) + out = model.generate(**inputs, max_new_tokens=100) + return processor.decode(out[0], skip_special_tokens=True) + + +def generate_chat_picture(picture, name1, name2): + text = f'*{name1} sends {name2} a picture that contains the following: "{caption_image(picture)}"*' + # lower the resolution of sent images for the chat, otherwise the log size gets out of control quickly with all the base64 values in visible history + picture.thumbnail((300, 300)) + buffer = BytesIO() + picture.save(buffer, format="JPEG") + img_str = base64.b64encode(buffer.getvalue()).decode('utf-8') + visible_text = f'{text}' + return text, visible_text + + +def ui(): + picture_select = gr.Image(label='Send a picture', type='pil') + + # Prepare the hijack with custom inputs + picture_select.upload(lambda picture, name1, 
name2: input_hijack.update({"state": True, "value": generate_chat_picture(picture, name1, name2)}), [picture_select, shared.gradio['name1'], shared.gradio['name2']], None) + + # Call the generation function + picture_select.upload(chat.cai_chatbot_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream) + + # Clear the picture from the upload field + picture_select.upload(lambda: None, [], [picture_select], show_progress=False) diff --git a/text-generation-webui/extensions/silero_tts/outputs/outputs-will-be-saved-here.txt b/text-generation-webui/extensions/silero_tts/outputs/outputs-will-be-saved-here.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/text-generation-webui/extensions/silero_tts/requirements.txt b/text-generation-webui/extensions/silero_tts/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..1017bf0d7accb9930872ededd8a4bc077d393958 --- /dev/null +++ b/text-generation-webui/extensions/silero_tts/requirements.txt @@ -0,0 +1,5 @@ +ipython +num2words +omegaconf +pydub +PyYAML diff --git a/text-generation-webui/extensions/silero_tts/script.py b/text-generation-webui/extensions/silero_tts/script.py new file mode 100644 index 0000000000000000000000000000000000000000..23f86e39e623bbfb8bfd2f40b787682618962667 --- /dev/null +++ b/text-generation-webui/extensions/silero_tts/script.py @@ -0,0 +1,182 @@ +import time +from pathlib import Path + +import gradio as gr +import torch + +from extensions.silero_tts import tts_preprocessor +from modules import chat, shared +from modules.html_generator import chat_html_wrapper + +torch._C._jit_set_profiling_mode(False) + + +params = { + 'activate': True, + 'speaker': 'en_56', + 'language': 'en', + 'model_id': 'v3_en', + 'sample_rate': 48000, + 'device': 'cpu', + 'show_text': False, + 'autoplay': True, + 'voice_pitch': 'medium', + 'voice_speed': 'medium', + 'local_cache_path': '' 
# Used for making text xml compatible, needed for voice pitch and speed control.
# Every XML-reserved character maps to its predefined entity; mapping a
# character onto itself would leave the text unescaped.
table = str.maketrans({
    "<": "&lt;",
    ">": "&gt;",
    "&": "&amp;",
    "'": "&apos;",
    '"': "&quot;",
})


def xmlesc(txt):
    """
    Return ``txt`` with the five XML-reserved characters replaced by entities.

    ``str.translate`` substitutes character by character in a single pass, so
    the ampersands introduced by the entities are not escaped a second time.
    """
    return txt.translate(table)
def remove_tts_from_history(name1, name2, mode):
    """
    Overwrite each visible reply with the reply stored in the internal history.

    For every turn in ``shared.history['internal']`` the first element of the
    matching visible entry is kept and the second is replaced by the internal
    entry's second element. Returns the result of ``chat_html_wrapper`` called
    on the updated visible history.
    """
    visible = shared.history['visible']
    for turn, internal_entry in enumerate(shared.history['internal']):
        kept_first = visible[turn][0]
        visible[turn] = [kept_first, internal_entry[1]]
    return chat_html_wrapper(shared.history['visible'], name1, name2, mode)
+ """ + + global model, current_params, streaming_state + + for i in params: + if params[i] != current_params[i]: + model = load_model() + current_params = params.copy() + break + + if not params['activate']: + return string + + original_string = string + string = tts_preprocessor.preprocess(string) + + if string == '': + string = '*Empty reply, try regenerating*' + else: + output_file = Path(f'extensions/silero_tts/outputs/{shared.character}_{int(time.time())}.wav') + prosody = ''.format(params['voice_speed'], params['voice_pitch']) + silero_input = f'{prosody}{xmlesc(string)}' + model.save_wav(ssml_text=silero_input, speaker=params['speaker'], sample_rate=int(params['sample_rate']), audio_path=str(output_file)) + + autoplay = 'autoplay' if params['autoplay'] else '' + string = f'' + if params['show_text']: + string += f'\n\n{original_string}' + + shared.processing_message = "*Is typing...*" + shared.args.no_stream = streaming_state # restore the streaming option to the previous value + return string + + +def bot_prefix_modifier(string): + """ + This function is only applied in chat mode. It modifies + the prefix text for the Bot and can be used to bias its + behavior. 
+ """ + + return string + + +def setup(): + global model + model = load_model() + + +def ui(): + # Gradio elements + with gr.Accordion("Silero TTS"): + with gr.Row(): + activate = gr.Checkbox(value=params['activate'], label='Activate TTS') + autoplay = gr.Checkbox(value=params['autoplay'], label='Play TTS automatically') + + show_text = gr.Checkbox(value=params['show_text'], label='Show message text under audio player') + voice = gr.Dropdown(value=params['speaker'], choices=voices_by_gender, label='TTS voice') + with gr.Row(): + v_pitch = gr.Dropdown(value=params['voice_pitch'], choices=voice_pitches, label='Voice pitch') + v_speed = gr.Dropdown(value=params['voice_speed'], choices=voice_speeds, label='Voice speed') + + with gr.Row(): + convert = gr.Button('Permanently replace audios with the message texts') + convert_cancel = gr.Button('Cancel', visible=False) + convert_confirm = gr.Button('Confirm (cannot be undone)', variant="stop", visible=False) + + # Convert history with confirmation + convert_arr = [convert_confirm, convert, convert_cancel] + convert.click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, convert_arr) + convert_confirm.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr) + convert_confirm.click(remove_tts_from_history, [shared.gradio[k] for k in ['name1', 'name2', 'Chat mode']], shared.gradio['display']) + convert_confirm.click(lambda: chat.save_history(timestamp=False), [], [], show_progress=False) + convert_cancel.click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, convert_arr) + + # Toggle message text in history + show_text.change(lambda x: params.update({"show_text": x}), show_text, None) + show_text.change(toggle_text_in_history, [shared.gradio[k] for k in ['name1', 'name2', 'Chat mode']], shared.gradio['display']) + show_text.change(lambda: chat.save_history(timestamp=False), [], [], 
show_progress=False) + + # Event functions to update the parameters in the backend + activate.change(lambda x: params.update({"activate": x}), activate, None) + autoplay.change(lambda x: params.update({"autoplay": x}), autoplay, None) + voice.change(lambda x: params.update({"speaker": x}), voice, None) + v_pitch.change(lambda x: params.update({"voice_pitch": x}), v_pitch, None) + v_speed.change(lambda x: params.update({"voice_speed": x}), v_speed, None) diff --git a/text-generation-webui/extensions/silero_tts/test_tts.py b/text-generation-webui/extensions/silero_tts/test_tts.py new file mode 100644 index 0000000000000000000000000000000000000000..ebc2c102a9ef29f21141429232f957421989cdd4 --- /dev/null +++ b/text-generation-webui/extensions/silero_tts/test_tts.py @@ -0,0 +1,81 @@ +import time +from pathlib import Path + +import torch +import tts_preprocessor + +torch._C._jit_set_profiling_mode(False) + + +params = { + 'activate': True, + 'speaker': 'en_49', + 'language': 'en', + 'model_id': 'v3_en', + 'sample_rate': 48000, + 'device': 'cpu', + 'show_text': True, + 'autoplay': True, + 'voice_pitch': 'medium', + 'voice_speed': 'medium', +} + +current_params = params.copy() +voices_by_gender = ['en_99', 'en_45', 'en_18', 'en_117', 'en_49', 'en_51', 'en_68', 'en_0', 'en_26', 'en_56', 'en_74', 'en_5', 'en_38', 'en_53', 'en_21', 'en_37', 'en_107', 'en_10', 'en_82', 'en_16', 'en_41', 'en_12', 'en_67', 'en_61', 'en_14', 'en_11', 'en_39', 'en_52', 'en_24', 'en_97', 'en_28', 'en_72', 'en_94', 'en_36', 'en_4', 'en_43', 'en_88', 'en_25', 'en_65', 'en_6', 'en_44', 'en_75', 'en_91', 'en_60', 'en_109', 'en_85', 'en_101', 'en_108', 'en_50', 'en_96', 'en_64', 'en_92', 'en_76', 'en_33', 'en_116', 'en_48', 'en_98', 'en_86', 'en_62', 'en_54', 'en_95', 'en_55', 'en_111', 'en_3', 'en_83', 'en_8', 'en_47', 'en_59', 'en_1', 'en_2', 'en_7', 'en_9', 'en_13', 'en_15', 'en_17', 'en_19', 'en_20', 'en_22', 'en_23', 'en_27', 'en_29', 'en_30', 'en_31', 'en_32', 'en_34', 'en_35', 'en_40', 'en_42', 
# Used for making text xml compatible, needed for voice pitch and speed control.
# The values must be the XML entities, not the characters themselves, or the
# translation is a no-op.
table = str.maketrans({
    "<": "&lt;",
    ">": "&gt;",
    "&": "&amp;",
    "'": "&apos;",
    '"': "&quot;",
})


def xmlesc(txt):
    """
    Escape ``<``, ``>``, ``&``, ``'`` and ``"`` in ``txt`` as XML entities.

    Uses the module-level ``table``; each input character is translated once,
    so the entity text produced here is never re-escaped.
    """
    return txt.translate(table)
0000000000000000000000000000000000000000..da163230d669674d87594d5b934c8f2bb7df7bea --- /dev/null +++ b/text-generation-webui/extensions/silero_tts/tts_preprocessor.py @@ -0,0 +1,194 @@ +import re + +from num2words import num2words + +punctuation = r'[\s,.?!/)\'\]>]' +alphabet_map = { + "A": " Ei ", + "B": " Bee ", + "C": " See ", + "D": " Dee ", + "E": " Eee ", + "F": " Eff ", + "G": " Jee ", + "H": " Eich ", + "I": " Eye ", + "J": " Jay ", + "K": " Kay ", + "L": " El ", + "M": " Emm ", + "N": " Enn ", + "O": " Ohh ", + "P": " Pee ", + "Q": " Queue ", + "R": " Are ", + "S": " Ess ", + "T": " Tee ", + "U": " You ", + "V": " Vee ", + "W": " Double You ", + "X": " Ex ", + "Y": " Why ", + "Z": " Zed " # Zed is weird, as I (da3dsoul) am American, but most of the voice models sound British, so it matches +} + + +def preprocess(string): + # the order for some of these matter + # For example, you need to remove the commas in numbers before expanding them + string = remove_surrounded_chars(string) + string = string.replace('"', '') + string = string.replace('\u201D', '').replace('\u201C', '') # right and left quote + string = string.replace('\u201F', '') # italic looking quote + string = string.replace('\n', ' ') + string = convert_num_locale(string) + string = replace_negative(string) + string = replace_roman(string) + string = hyphen_range_to(string) + string = num_to_words(string) + + # TODO Try to use a ML predictor to expand abbreviations. 
It's hard, dependent on context, and whether to actually + # try to say the abbreviation or spell it out as I've done below is not agreed upon + + # For now, expand abbreviations to pronunciations + # replace_abbreviations adds a lot of unnecessary whitespace to ensure separation + string = replace_abbreviations(string) + string = replace_lowercase_abbreviations(string) + + # cleanup whitespaces + # remove whitespace before punctuation + string = re.sub(rf'\s+({punctuation})', r'\1', string) + string = string.strip() + # compact whitespace + string = ' '.join(string.split()) + + return string + + +def remove_surrounded_chars(string): + # this expression matches to 'as few symbols as possible (0 upwards) between any asterisks' OR + # 'as few symbols as possible (0 upwards) between an asterisk and the end of the string' + return re.sub(r'\*[^*]*?(\*|$)', '', string) + + +def convert_num_locale(text): + # This detects locale and converts it to American without comma separators + pattern = re.compile(r'(?:\s|^)\d{1,3}(?:\.\d{3})+(,\d+)(?:\s|$)') + result = text + while True: + match = pattern.search(result) + if match is None: + break + + start = match.start() + end = match.end() + result = result[0:start] + result[start:end].replace('.', '').replace(',', '.') + result[end:len(result)] + + # removes comma separators from existing American numbers + pattern = re.compile(r'(\d),(\d)') + result = pattern.sub(r'\1\2', result) + + return result + + +def replace_negative(string): + # handles situations like -5. -5 would become negative 5, which would then be expanded to negative five + return re.sub(rf'(\s)(-)(\d+)({punctuation})', r'\1negative \3\4', string) + + +def replace_roman(string): + # find a string of roman numerals. 
+ # Only 2 or more, to avoid capturing I and single character abbreviations, like names + pattern = re.compile(rf'\s[IVXLCDM]{{2,}}{punctuation}') + result = string + while True: + match = pattern.search(result) + if match is None: + break + + start = match.start() + end = match.end() + result = result[0:start + 1] + str(roman_to_int(result[start + 1:end - 1])) + result[end - 1:len(result)] + + return result + + +def roman_to_int(s): + rom_val = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000} + int_val = 0 + for i in range(len(s)): + if i > 0 and rom_val[s[i]] > rom_val[s[i - 1]]: + int_val += rom_val[s[i]] - 2 * rom_val[s[i - 1]] + else: + int_val += rom_val[s[i]] + return int_val + + +def hyphen_range_to(text): + pattern = re.compile(r'(\d+)[-–](\d+)') + result = pattern.sub(lambda x: x.group(1) + ' to ' + x.group(2), text) + return result + + +def num_to_words(text): + # 1000 or 10.23 + pattern = re.compile(r'\d+\.\d+|\d+') + result = pattern.sub(lambda x: num2words(float(x.group())), text) + return result + + +def replace_abbreviations(string): + # abbreviations 1 to 4 characters long. It will get things like A and I, but those are pronounced with their letter + pattern = re.compile(rf'(^|[\s(.\'\[<])([A-Z]{{1,4}})({punctuation}|$)') + result = string + while True: + match = pattern.search(result) + if match is None: + break + + start = match.start() + end = match.end() + result = result[0:start] + replace_abbreviation(result[start:end]) + result[end:len(result)] + + return result + + +def replace_lowercase_abbreviations(string): + # abbreviations 1 to 4 characters long, separated by dots i.e. e.g. 
+ pattern = re.compile(rf'(^|[\s(.\'\[<])(([a-z]\.){{1,4}})({punctuation}|$)') + result = string + while True: + match = pattern.search(result) + if match is None: + break + + start = match.start() + end = match.end() + result = result[0:start] + replace_abbreviation(result[start:end].upper()) + result[end:len(result)] + + return result + + +def replace_abbreviation(string): + result = "" + for char in string: + result += match_mapping(char) + + return result + + +def match_mapping(char): + for mapping in alphabet_map.keys(): + if char == mapping: + return alphabet_map[char] + + return char + + +def __main__(args): + print(preprocess(args[1])) + + +if __name__ == "__main__": + import sys + __main__(sys.argv) diff --git a/text-generation-webui/extensions/whisper_stt/requirements.txt b/text-generation-webui/extensions/whisper_stt/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..770c38bba8a61d58eff8dd6c068a070110d6a385 --- /dev/null +++ b/text-generation-webui/extensions/whisper_stt/requirements.txt @@ -0,0 +1,4 @@ +git+https://github.com/Uberi/speech_recognition.git@010382b +openai-whisper +soundfile +ffmpeg diff --git a/text-generation-webui/extensions/whisper_stt/script.py b/text-generation-webui/extensions/whisper_stt/script.py new file mode 100644 index 0000000000000000000000000000000000000000..6ef60c57a5c4495b6aef59fbc4f9d5d2a5362f46 --- /dev/null +++ b/text-generation-webui/extensions/whisper_stt/script.py @@ -0,0 +1,54 @@ +import gradio as gr +import speech_recognition as sr + +input_hijack = { + 'state': False, + 'value': ["", ""] +} + + +def do_stt(audio, text_state=""): + transcription = "" + r = sr.Recognizer() + + # Convert to AudioData + audio_data = sr.AudioData(sample_rate=audio[0], frame_data=audio[1], sample_width=4) + + try: + transcription = r.recognize_whisper(audio_data, language="english", model="base.en") + except sr.UnknownValueError: + print("Whisper could not understand audio") + except sr.RequestError as 
e: + print("Could not request results from Whisper", e) + + input_hijack.update({"state": True, "value": [transcription, transcription]}) + + text_state += transcription + " " + return text_state, text_state + + +def update_hijack(val): + input_hijack.update({"state": True, "value": [val, val]}) + return val + + +def auto_transcribe(audio, audio_auto, text_state=""): + if audio is None: + return "", "" + if audio_auto: + return do_stt(audio, text_state) + return "", "" + + +def ui(): + tr_state = gr.State(value="") + output_transcription = gr.Textbox(label="STT-Input", + placeholder="Speech Preview. Click \"Generate\" to send", + interactive=True) + output_transcription.change(fn=update_hijack, inputs=[output_transcription], outputs=[tr_state]) + audio_auto = gr.Checkbox(label="Auto-Transcribe", value=True) + with gr.Row(): + audio = gr.Audio(source="microphone") + audio.change(fn=auto_transcribe, inputs=[audio, audio_auto, tr_state], outputs=[output_transcription, tr_state]) + transcribe_button = gr.Button(value="Transcribe") + transcribe_button.click(do_stt, inputs=[audio, tr_state], outputs=[output_transcription, tr_state]) diff --git a/text-generation-webui/loras/place-your-loras-here.txt b/text-generation-webui/loras/place-your-loras-here.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/text-generation-webui/models/llama-7b/config.json b/text-generation-webui/models/llama-7b/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ef0484f4fd8c6e39de6de5ef740b3cff866dcfaa --- /dev/null +++ b/text-generation-webui/models/llama-7b/config.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 11008, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "pad_token_id": 0, + 
"rms_norm_eps": 1e-06, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.28.0.dev0", + "use_cache": true, + "vocab_size": 32000 +} diff --git a/text-generation-webui/models/llama-7b/generation_config.json b/text-generation-webui/models/llama-7b/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..55d7b5b6db760f8c1963be3d56a3bc363bacdfb1 --- /dev/null +++ b/text-generation-webui/models/llama-7b/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 0, + "transformers_version": "4.28.0.dev0" +} diff --git a/text-generation-webui/models/llama-7b/model-00001-of-00002.safetensors b/text-generation-webui/models/llama-7b/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b6dcbd92c05cd25445a4130a39771f1626c2ad16 --- /dev/null +++ b/text-generation-webui/models/llama-7b/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f15a919bdb38ad81574d399c95053bbf44ae69440a9a29f4d965b691c957b54 +size 9976578930 diff --git a/text-generation-webui/models/llama-7b/model-00002-of-00002.safetensors b/text-generation-webui/models/llama-7b/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..037cc9d1bd2ac3201f11aa192084d933f5786f5a --- /dev/null +++ b/text-generation-webui/models/llama-7b/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e162edeaa58f65288ed886f79e89b9f777df435025e58632f821953e35295415 +size 3500297344 diff --git a/text-generation-webui/models/llama-7b/model.safetensors.index.json b/text-generation-webui/models/llama-7b/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8b6245796e966e50960a317e4a54aa7bf73b0186 --- /dev/null +++ 
b/text-generation-webui/models/llama-7b/model.safetensors.index.json @@ -0,0 +1,330 @@ +{ + "metadata": { + "total_size": 13476839424 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + 
"model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.23.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.rotary_emb.inv_freq": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.rotary_emb.inv_freq": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": 
"model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.rotary_emb.inv_freq": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.rotary_emb.inv_freq": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + 
"model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.rotary_emb.inv_freq": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.rotary_emb.inv_freq": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": 
"model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.rotary_emb.inv_freq": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.rotary_emb.inv_freq": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.rotary_emb.inv_freq": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/text-generation-webui/models/llama-7b/special_tokens_map.json b/text-generation-webui/models/llama-7b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b --- /dev/null +++ b/text-generation-webui/models/llama-7b/special_tokens_map.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/text-generation-webui/models/llama-7b/tokenizer.model b/text-generation-webui/models/llama-7b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/text-generation-webui/models/llama-7b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/text-generation-webui/models/llama-7b/tokenizer_config.json b/text-generation-webui/models/llama-7b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a54b01aa3699f19e1aea416fc337f910f60c6839 --- /dev/null +++ b/text-generation-webui/models/llama-7b/tokenizer_config.json @@ -0,0 +1 @@ +{"bos_token": "", "eos_token": "", "model_max_length": 1000000000000000019884624838656, "tokenizer_class": "LlamaTokenizer", "unk_token": ""} \ No newline at end of file diff --git a/text-generation-webui/models/place-your-models-here.txt b/text-generation-webui/models/place-your-models-here.txt new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/text-generation-webui/modules/GPTQ_loader.py b/text-generation-webui/modules/GPTQ_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..3f42e5c6d37be89fc39d6721d72e77500472482a --- /dev/null +++ b/text-generation-webui/modules/GPTQ_loader.py @@ -0,0 +1,155 @@ +import inspect +import re +import sys +from pathlib import Path + +import accelerate +import torch +import transformers +from transformers import AutoConfig, AutoModelForCausalLM + +import modules.shared as shared + +sys.path.insert(0, str(Path("repositories/GPTQ-for-LLaMa"))) +import llama_inference_offload +from modelutils import find_layers +from quant import make_quant + + +def _load_quant(model, checkpoint, wbits, groupsize=-1, faster_kernel=False, exclude_layers=['lm_head'], kernel_switch_threshold=128): + + def noop(*args, **kwargs): + pass + + config = AutoConfig.from_pretrained(model) + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = AutoModelForCausalLM.from_config(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in exclude_layers: + if name in layers: + del layers[name] + + gptq_args = inspect.getfullargspec(make_quant).args + + make_quant_kwargs = { + 'module': model, + 'names': layers, + 'bits': wbits, + } + if 'groupsize' in gptq_args: + make_quant_kwargs['groupsize'] = groupsize + if 'faster' in gptq_args: + make_quant_kwargs['faster'] = faster_kernel + if 'kernel_switch_threshold' in gptq_args: + make_quant_kwargs['kernel_switch_threshold'] = kernel_switch_threshold + + make_quant(**make_quant_kwargs) + + del layers + + print('Loading model ...') + if checkpoint.endswith('.safetensors'): + from safetensors.torch import 
load_file as safe_load + model.load_state_dict(safe_load(checkpoint), strict=False) + else: + model.load_state_dict(torch.load(checkpoint), strict=False) + model.seqlen = 2048 + print('Done.') + + return model + + +def load_quantized(model_name): + if not shared.args.model_type: + # Try to determine model type from model name + name = model_name.lower() + if any((k in name for k in ['llama', 'alpaca', 'vicuna'])): + model_type = 'llama' + elif any((k in name for k in ['opt-', 'galactica'])): + model_type = 'opt' + elif any((k in name for k in ['gpt-j', 'pygmalion-6b'])): + model_type = 'gptj' + else: + print("Can't determine model type from model name. Please specify it manually using --model_type " + "argument") + exit() + else: + model_type = shared.args.model_type.lower() + + if shared.args.pre_layer and model_type == 'llama': + load_quant = llama_inference_offload.load_quant + elif model_type in ('llama', 'opt', 'gptj'): + if shared.args.pre_layer: + print("Warning: ignoring --pre_layer because it only works for llama model type.") + load_quant = _load_quant + else: + print("Unknown pre-quantized model type specified. Only 'llama', 'opt' and 'gptj' are supported") + exit() + + # Now we are going to try to locate the quantized model file. 
+ path_to_model = Path(f'{shared.args.model_dir}/{model_name}') + found_pts = list(path_to_model.glob("*.pt")) + found_safetensors = list(path_to_model.glob("*.safetensors")) + pt_path = None + + if len(found_pts) == 1: + pt_path = found_pts[0] + elif len(found_safetensors) == 1: + pt_path = found_safetensors[0] + else: + if path_to_model.name.lower().startswith('llama-7b'): + pt_model = f'llama-7b-{shared.args.wbits}bit' + elif path_to_model.name.lower().startswith('llama-13b'): + pt_model = f'llama-13b-{shared.args.wbits}bit' + elif path_to_model.name.lower().startswith('llama-30b'): + pt_model = f'llama-30b-{shared.args.wbits}bit' + elif path_to_model.name.lower().startswith('llama-65b'): + pt_model = f'llama-65b-{shared.args.wbits}bit' + else: + pt_model = f'{model_name}-{shared.args.wbits}bit' + + # Try to find the .safetensors or .pt both in the model dir and in the subfolder + for path in [Path(p + ext) for ext in ['.safetensors', '.pt'] for p in [f"{shared.args.model_dir}/{pt_model}", f"{path_to_model}/{pt_model}"]]: + if path.exists(): + print(f"Found {path}") + pt_path = path + break + + if not pt_path: + print("Could not find the quantized model in .pt or .safetensors format, exiting...") + exit() + + # qwopqwop200's offload + if model_type == 'llama' and shared.args.pre_layer: + model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, shared.args.pre_layer) + else: + threshold = False if model_type == 'gptj' else 128 + model = load_quant(str(path_to_model), str(pt_path), shared.args.wbits, shared.args.groupsize, kernel_switch_threshold=threshold) + + # accelerate offload (doesn't work properly) + if shared.args.gpu_memory: + memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory)) + max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB' + max_memory = {} + for i in range(len(memory_map)): + max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', 
memory_map[i].lower()) else memory_map[i] + max_memory['cpu'] = max_cpu_memory + + device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"]) + print("Using the following device map for the 4-bit model:", device_map) + # https://huggingface.co./docs/accelerate/package_reference/big_modeling#accelerate.dispatch_model + model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True) + + # No offload + elif not shared.args.cpu: + model = model.to(torch.device('cuda:0')) + + return model diff --git a/text-generation-webui/modules/LoRA.py b/text-generation-webui/modules/LoRA.py new file mode 100644 index 0000000000000000000000000000000000000000..17dd722968b752128e70af5d8cf811b8589a1ba7 --- /dev/null +++ b/text-generation-webui/modules/LoRA.py @@ -0,0 +1,43 @@ +from pathlib import Path + +import torch +from peft import PeftModel + +import modules.shared as shared +from modules.models import load_model +from modules.text_generation import clear_torch_cache + + +def reload_model(): + shared.model = shared.tokenizer = None + clear_torch_cache() + shared.model, shared.tokenizer = load_model(shared.model_name) + + +def add_lora_to_model(lora_name): + + # If a LoRA had been previously loaded, or if we want + # to unload a LoRA, reload the model + if shared.lora_name not in ['None', ''] or lora_name in ['None', '']: + reload_model() + shared.lora_name = lora_name + + if lora_name not in ['None', '']: + print(f"Adding the LoRA {lora_name} to the model...") + params = {} + if not shared.args.cpu: + params['dtype'] = shared.model.dtype + if hasattr(shared.model, "hf_device_map"): + params['device_map'] = {"base_model.model." 
+ k: v for k, v in shared.model.hf_device_map.items()} + elif shared.args.load_in_8bit: + params['device_map'] = {'': 0} + + shared.model = PeftModel.from_pretrained(shared.model, Path(f"{shared.args.lora_dir}/{lora_name}"), **params) + if not shared.args.load_in_8bit and not shared.args.cpu: + shared.model.half() + if not hasattr(shared.model, "hf_device_map"): + if torch.has_mps: + device = torch.device('mps') + shared.model = shared.model.to(device) + else: + shared.model = shared.model.cuda() diff --git a/text-generation-webui/modules/RWKV.py b/text-generation-webui/modules/RWKV.py new file mode 100644 index 0000000000000000000000000000000000000000..0405230eee3cae31c1b33491dff38e10c02b623b --- /dev/null +++ b/text-generation-webui/modules/RWKV.py @@ -0,0 +1,75 @@ +import os +from pathlib import Path + +import numpy as np +from tokenizers import Tokenizer + +import modules.shared as shared +from modules.callbacks import Iteratorize + +np.set_printoptions(precision=4, suppress=True, linewidth=200) + +os.environ['RWKV_JIT_ON'] = '1' +os.environ["RWKV_CUDA_ON"] = '1' if shared.args.rwkv_cuda_on else '0' # use CUDA kernel for seq mode (much faster) + +from rwkv.model import RWKV +from rwkv.utils import PIPELINE, PIPELINE_ARGS + + +class RWKVModel: + def __init__(self): + pass + + @classmethod + def from_pretrained(self, path, dtype="fp16", device="cuda"): + tokenizer_path = Path(f"{path.parent}/20B_tokenizer.json") + + if shared.args.rwkv_strategy is None: + model = RWKV(model=str(path), strategy=f'{device} {dtype}') + else: + model = RWKV(model=str(path), strategy=shared.args.rwkv_strategy) + pipeline = PIPELINE(model, str(tokenizer_path)) + + result = self() + result.pipeline = pipeline + return result + + def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=None, alpha_frequency=0.1, alpha_presence=0.1, token_ban=[0], token_stop=[], callback=None): + args = PIPELINE_ARGS( + temperature=temperature, + top_p=top_p, + 
top_k=top_k, + alpha_frequency=alpha_frequency, # Frequency Penalty (as in GPT-3) + alpha_presence=alpha_presence, # Presence Penalty (as in GPT-3) + token_ban=token_ban, # ban the generation of some tokens + token_stop=token_stop + ) + + return self.pipeline.generate(context, token_count=token_count, args=args, callback=callback) + + def generate_with_streaming(self, **kwargs): + with Iteratorize(self.generate, kwargs, callback=None) as generator: + reply = '' + for token in generator: + reply += token + yield reply + + +class RWKVTokenizer: + def __init__(self): + pass + + @classmethod + def from_pretrained(self, path): + tokenizer_path = path / "20B_tokenizer.json" + tokenizer = Tokenizer.from_file(str(tokenizer_path)) + + result = self() + result.tokenizer = tokenizer + return result + + def encode(self, prompt): + return self.tokenizer.encode(prompt).ids + + def decode(self, ids): + return self.tokenizer.decode(ids) diff --git a/text-generation-webui/modules/__pycache__/LoRA.cpython-310.pyc b/text-generation-webui/modules/__pycache__/LoRA.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6bd3e9ced2c7c064127742d1d4c2916ecfb0a509 Binary files /dev/null and b/text-generation-webui/modules/__pycache__/LoRA.cpython-310.pyc differ diff --git a/text-generation-webui/modules/__pycache__/api.cpython-310.pyc b/text-generation-webui/modules/__pycache__/api.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d48053231163b672d22b5fbfc20c443587a601b Binary files /dev/null and b/text-generation-webui/modules/__pycache__/api.cpython-310.pyc differ diff --git a/text-generation-webui/modules/__pycache__/callbacks.cpython-310.pyc b/text-generation-webui/modules/__pycache__/callbacks.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ff67401d6e9ab228ab82ee8533b331aaa42ea9a Binary files /dev/null and b/text-generation-webui/modules/__pycache__/callbacks.cpython-310.pyc differ 
diff --git a/text-generation-webui/modules/__pycache__/chat.cpython-310.pyc b/text-generation-webui/modules/__pycache__/chat.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a81f4ad3233cf710fc630497ea393b6f1a1fc94f Binary files /dev/null and b/text-generation-webui/modules/__pycache__/chat.cpython-310.pyc differ diff --git a/text-generation-webui/modules/__pycache__/extensions.cpython-310.pyc b/text-generation-webui/modules/__pycache__/extensions.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..637b4979e8e81608226f96be379beef65b0d3715 Binary files /dev/null and b/text-generation-webui/modules/__pycache__/extensions.cpython-310.pyc differ diff --git a/text-generation-webui/modules/__pycache__/html_generator.cpython-310.pyc b/text-generation-webui/modules/__pycache__/html_generator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a663d684daa8c671748aa2f70dfe83f84a05ade Binary files /dev/null and b/text-generation-webui/modules/__pycache__/html_generator.cpython-310.pyc differ diff --git a/text-generation-webui/modules/__pycache__/models.cpython-310.pyc b/text-generation-webui/modules/__pycache__/models.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ef171a311e9c0c7e37ea5f538d175e216d6e33d Binary files /dev/null and b/text-generation-webui/modules/__pycache__/models.cpython-310.pyc differ diff --git a/text-generation-webui/modules/__pycache__/shared.cpython-310.pyc b/text-generation-webui/modules/__pycache__/shared.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..75a689456ca003f3771686c7fa4e464a8c279f1e Binary files /dev/null and b/text-generation-webui/modules/__pycache__/shared.cpython-310.pyc differ diff --git a/text-generation-webui/modules/__pycache__/text_generation.cpython-310.pyc b/text-generation-webui/modules/__pycache__/text_generation.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..2dec172b091346cbd59c2d29ede02fc8e19fcc81 Binary files /dev/null and b/text-generation-webui/modules/__pycache__/text_generation.cpython-310.pyc differ diff --git a/text-generation-webui/modules/__pycache__/training.cpython-310.pyc b/text-generation-webui/modules/__pycache__/training.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6df03b2631b186124a558449fbee9ab76f84bc65 Binary files /dev/null and b/text-generation-webui/modules/__pycache__/training.cpython-310.pyc differ diff --git a/text-generation-webui/modules/__pycache__/ui.cpython-310.pyc b/text-generation-webui/modules/__pycache__/ui.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b47a239d44d9db7cb2f57c94dde7fb5da3aec1b6 Binary files /dev/null and b/text-generation-webui/modules/__pycache__/ui.cpython-310.pyc differ diff --git a/text-generation-webui/modules/api.py b/text-generation-webui/modules/api.py new file mode 100644 index 0000000000000000000000000000000000000000..f18ad4cfca3ed8f6f8fc0df34cfa0b762f1ed3b9 --- /dev/null +++ b/text-generation-webui/modules/api.py @@ -0,0 +1,39 @@ +import json + +import gradio as gr + +from modules import shared +from modules.text_generation import generate_reply + + +def generate_reply_wrapper(string): + generate_params = { + 'do_sample': True, + 'temperature': 1, + 'top_p': 1, + 'typical_p': 1, + 'repetition_penalty': 1, + 'encoder_repetition_penalty': 1, + 'top_k': 50, + 'num_beams': 1, + 'penalty_alpha': 0, + 'min_length': 0, + 'length_penalty': 1, + 'no_repeat_ngram_size': 0, + 'early_stopping': False, + } + params = json.loads(string) + for k in params[1]: + generate_params[k] = params[1][k] + for i in generate_reply(params[0], generate_params): + yield i + + +def create_apis(): + t1 = gr.Textbox(visible=False) + t2 = gr.Textbox(visible=False) + dummy = gr.Button(visible=False) + + input_params = [t1] + output_params = [t2] + [shared.gradio[k] for k in 
['markdown', 'html']] + dummy.click(generate_reply_wrapper, input_params, output_params, api_name='textgen') diff --git a/text-generation-webui/modules/callbacks.py b/text-generation-webui/modules/callbacks.py new file mode 100644 index 0000000000000000000000000000000000000000..51ecbdd72d8cb471df8aaeab6859515d73e188c8 --- /dev/null +++ b/text-generation-webui/modules/callbacks.py @@ -0,0 +1,105 @@ +import gc +import traceback +from queue import Queue +from threading import Thread + +import torch +import transformers + +import modules.shared as shared + + +# Copied from https://github.com/PygmalionAI/gradio-ui/ +class _SentinelTokenStoppingCriteria(transformers.StoppingCriteria): + + def __init__(self, sentinel_token_ids: list, starting_idx: int): + transformers.StoppingCriteria.__init__(self) + self.sentinel_token_ids = sentinel_token_ids + self.starting_idx = starting_idx + + def __call__(self, input_ids: torch.LongTensor, _scores: torch.FloatTensor) -> bool: + for sample in input_ids: + trimmed_sample = sample[self.starting_idx:] + + for i in range(len(self.sentinel_token_ids)): + # Can't unfold, output is still too tiny. Skip. + if trimmed_sample.shape[-1] < self.sentinel_token_ids[i].shape[-1]: + continue + for window in trimmed_sample.unfold(0, self.sentinel_token_ids[i].shape[-1], 1): + if torch.all(torch.eq(self.sentinel_token_ids[i][0], window)): + return True + return False + + +class Stream(transformers.StoppingCriteria): + def __init__(self, callback_func=None): + self.callback_func = callback_func + + def __call__(self, input_ids, scores) -> bool: + if self.callback_func is not None: + self.callback_func(input_ids[0]) + return False + + +class Iteratorize: + + """ + Transforms a function that takes a callback + into a lazy iterator (generator). 
+ """ + + def __init__(self, func, kwargs={}, callback=None): + self.mfunc = func + self.c_callback = callback + self.q = Queue() + self.sentinel = object() + self.kwargs = kwargs + self.stop_now = False + + def _callback(val): + if self.stop_now or shared.stop_everything: + raise ValueError + self.q.put(val) + + def gentask(): + try: + ret = self.mfunc(callback=_callback, **self.kwargs) + except ValueError: + pass + except: + traceback.print_exc() + pass + + clear_torch_cache() + self.q.put(self.sentinel) + if self.c_callback: + self.c_callback(ret) + + self.thread = Thread(target=gentask) + self.thread.start() + + def __iter__(self): + return self + + def __next__(self): + obj = self.q.get(True, None) + if obj is self.sentinel: + raise StopIteration + else: + return obj + + def __del__(self): + clear_torch_cache() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.stop_now = True + clear_torch_cache() + + +def clear_torch_cache(): + gc.collect() + if not shared.args.cpu: + torch.cuda.empty_cache() diff --git a/text-generation-webui/modules/chat.py b/text-generation-webui/modules/chat.py new file mode 100644 index 0000000000000000000000000000000000000000..06bd8785ff889d8f1c4a11eed7d3943ad88a0e36 --- /dev/null +++ b/text-generation-webui/modules/chat.py @@ -0,0 +1,471 @@ +import base64 +import copy +import io +import json +import re +from datetime import datetime +from pathlib import Path + +import yaml +from PIL import Image + +import modules.extensions as extensions_module +import modules.shared as shared +from modules.extensions import apply_extensions +from modules.html_generator import (chat_html_wrapper, fix_newlines, + make_thumbnail) +from modules.text_generation import (encode, generate_reply, + get_max_prompt_length) + + +def generate_chat_prompt(user_input, max_new_tokens, name1, name2, context, chat_prompt_size, **kwargs): + is_instruct = kwargs['is_instruct'] if 'is_instruct' in kwargs else False + 
end_of_turn = kwargs['end_of_turn'] if 'end_of_turn' in kwargs else '' + impersonate = kwargs['impersonate'] if 'impersonate' in kwargs else False + also_return_rows = kwargs['also_return_rows'] if 'also_return_rows' in kwargs else False + rows = [f"{context.strip()}\n"] + + # Finding the maximum prompt size + if shared.soft_prompt: + chat_prompt_size -= shared.soft_prompt_tensor.shape[1] + max_length = min(get_max_prompt_length(max_new_tokens), chat_prompt_size) + + if is_instruct: + prefix1 = f"{name1}\n" + prefix2 = f"{name2}\n" + else: + prefix1 = f"{name1}: " + prefix2 = f"{name2}: " + + i = len(shared.history['internal']) - 1 + while i >= 0 and len(encode(''.join(rows), max_new_tokens)[0]) < max_length: + rows.insert(1, f"{prefix2}{shared.history['internal'][i][1].strip()}{end_of_turn}\n") + string = shared.history['internal'][i][0] + if string not in ['', '<|BEGIN-VISIBLE-CHAT|>']: + rows.insert(1, f"{prefix1}{string.strip()}{end_of_turn}\n") + i -= 1 + + if impersonate: + rows.append(f"{prefix1.strip() if not is_instruct else prefix1}") + limit = 2 + else: + # Adding the user message + user_input = fix_newlines(user_input) + if len(user_input) > 0: + rows.append(f"{prefix1}{user_input}{end_of_turn}\n") + + # Adding the Character prefix + rows.append(apply_extensions(f"{prefix2.strip() if not is_instruct else prefix2}", "bot_prefix")) + limit = 3 + + while len(rows) > limit and len(encode(''.join(rows), max_new_tokens)[0]) >= max_length: + rows.pop(1) + prompt = ''.join(rows) + + if also_return_rows: + return prompt, rows + else: + return prompt + + +def extract_message_from_reply(reply, name1, name2, stop_at_newline): + next_character_found = False + + if stop_at_newline: + lines = reply.split('\n') + reply = lines[0].strip() + if len(lines) > 1: + next_character_found = True + else: + for string in [f"\n{name1}:", f"\n{name2}:"]: + idx = reply.find(string) + if idx != -1: + reply = reply[:idx] + next_character_found = True + + # If something like "\nYo" is 
generated just before "\nYou:" + # is completed, trim it + if not next_character_found: + for string in [f"\n{name1}:", f"\n{name2}:"]: + for j in range(len(string) - 1, 0, -1): + if reply[-j:] == string[:j]: + reply = reply[:-j] + break + else: + continue + break + + reply = fix_newlines(reply) + return reply, next_character_found + + +def chatbot_wrapper(text, generate_state, name1, name2, context, mode, end_of_turn, regenerate=False): + if mode == 'instruct': + stopping_strings = [f"\n{name1}", f"\n{name2}"] + else: + stopping_strings = [f"\n{name1}:", f"\n{name2}:"] + + # Defining some variables + cumulative_reply = '' + just_started = True + name1_original = name1 + visible_text = custom_generate_chat_prompt = None + eos_token = '\n' if generate_state['stop_at_newline'] else None + if 'pygmalion' in shared.model_name.lower(): + name1 = "You" + + # Check if any extension wants to hijack this function call + for extension, _ in extensions_module.iterator(): + if hasattr(extension, 'input_hijack') and extension.input_hijack['state']: + extension.input_hijack['state'] = False + text, visible_text = extension.input_hijack['value'] + if custom_generate_chat_prompt is None and hasattr(extension, 'custom_generate_chat_prompt'): + custom_generate_chat_prompt = extension.custom_generate_chat_prompt + + if visible_text is None: + visible_text = text + text = apply_extensions(text, "input") + + # Generating the prompt + kwargs = {'end_of_turn': end_of_turn, 'is_instruct': mode == 'instruct'} + if custom_generate_chat_prompt is None: + prompt = generate_chat_prompt(text, generate_state['max_new_tokens'], name1, name2, context, generate_state['chat_prompt_size'], **kwargs) + else: + prompt = custom_generate_chat_prompt(text, generate_state['max_new_tokens'], name1, name2, context, generate_state['chat_prompt_size'], **kwargs) + + # Yield *Is typing...* + if not regenerate: + yield shared.history['visible'] + [[visible_text, shared.processing_message]] + + # Generate + for i 
in range(generate_state['chat_generation_attempts']): + reply = None + for reply in generate_reply(f"{prompt}{' ' if len(cumulative_reply) > 0 else ''}{cumulative_reply}", generate_state, eos_token=eos_token, stopping_strings=stopping_strings): + reply = cumulative_reply + reply + + # Extracting the reply + reply, next_character_found = extract_message_from_reply(reply, name1, name2, generate_state['stop_at_newline']) + visible_reply = re.sub("(||{{user}})", name1_original, reply) + visible_reply = apply_extensions(visible_reply, "output") + + # We need this global variable to handle the Stop event, + # otherwise gradio gets confused + if shared.stop_everything: + return shared.history['visible'] + if just_started: + just_started = False + shared.history['internal'].append(['', '']) + shared.history['visible'].append(['', '']) + + shared.history['internal'][-1] = [text, reply] + shared.history['visible'][-1] = [visible_text, visible_reply] + if not shared.args.no_stream: + yield shared.history['visible'] + if next_character_found: + break + + if reply is not None: + cumulative_reply = reply + + yield shared.history['visible'] + + +def impersonate_wrapper(text, generate_state, name1, name2, context, mode, end_of_turn): + if mode == 'instruct': + stopping_strings = [f"\n{name1}", f"\n{name2}"] + else: + stopping_strings = [f"\n{name1}:", f"\n{name2}:"] + + # Defining some variables + cumulative_reply = '' + eos_token = '\n' if generate_state['stop_at_newline'] else None + if 'pygmalion' in shared.model_name.lower(): + name1 = "You" + + prompt = generate_chat_prompt(text, generate_state['max_new_tokens'], name1, name2, context, generate_state['chat_prompt_size'], impersonate=True, end_of_turn=end_of_turn) + + # Yield *Is typing...* + yield shared.processing_message + + for i in range(generate_state['chat_generation_attempts']): + reply = None + for reply in generate_reply(f"{prompt}{' ' if len(cumulative_reply) > 0 else ''}{cumulative_reply}", generate_state, 
eos_token=eos_token, stopping_strings=stopping_strings): + reply = cumulative_reply + reply + reply, next_character_found = extract_message_from_reply(reply, name1, name2, generate_state['stop_at_newline']) + yield reply + if next_character_found: + break + + if reply is not None: + cumulative_reply = reply + + yield reply + + +def cai_chatbot_wrapper(text, generate_state, name1, name2, context, mode, end_of_turn): + for history in chatbot_wrapper(text, generate_state, name1, name2, context, mode, end_of_turn): + yield chat_html_wrapper(history, name1, name2, mode) + + +def regenerate_wrapper(text, generate_state, name1, name2, context, mode, end_of_turn): + if (len(shared.history['visible']) == 1 and not shared.history['visible'][0][0]) or len(shared.history['internal']) == 0: + yield chat_html_wrapper(shared.history['visible'], name1, name2, mode) + else: + last_visible = shared.history['visible'].pop() + last_internal = shared.history['internal'].pop() + # Yield '*Is typing...*' + yield chat_html_wrapper(shared.history['visible'] + [[last_visible[0], shared.processing_message]], name1, name2, mode) + for history in chatbot_wrapper(last_internal[0], generate_state, name1, name2, context, mode, end_of_turn, regenerate=True): + shared.history['visible'][-1] = [last_visible[0], history[-1][1]] + yield chat_html_wrapper(shared.history['visible'], name1, name2, mode) + + +def remove_last_message(name1, name2, mode): + if len(shared.history['visible']) > 0 and shared.history['internal'][-1][0] != '<|BEGIN-VISIBLE-CHAT|>': + last = shared.history['visible'].pop() + shared.history['internal'].pop() + else: + last = ['', ''] + + return chat_html_wrapper(shared.history['visible'], name1, name2, mode), last[0] + + +def send_last_reply_to_input(): + if len(shared.history['internal']) > 0: + return shared.history['internal'][-1][1] + else: + return '' + + +def replace_last_reply(text, name1, name2, mode): + if len(shared.history['visible']) > 0: + 
def tokenize_dialogue(dialogue, name1, name2, mode=None):
    """Split a plain-text dialogue into ``[user_message, bot_message]`` pairs.

    Parameters:
        dialogue: text where each turn starts on its own line with
            ``<name1>:`` or ``<name2>:``.
        name1: the user's name.
        name2: the bot's name.
        mode: unused; optional so that three-argument callers keep working.

    Returns:
        A list of two-element lists. A pair is closed each time a bot turn is
        seen, so a trailing user turn with no bot answer is not included.
        Returns an empty list when no turn marker is found.
    """
    history = []
    messages = []
    # NOTE(review): both patterns below are empty, which makes these calls
    # no-ops. They look like markers that lost their angle-bracket content
    # somewhere upstream of this file -- confirm against the original source.
    dialogue = re.sub('', '', dialogue)
    dialogue = re.sub('', '', dialogue)
    dialogue = re.sub('(\n|^)[Aa]non:', '\\1You:', dialogue)
    # Function replacement so that name2 is inserted literally and never
    # interpreted as a regex replacement template.
    dialogue = re.sub(r'(\n|^)\[CHARACTER\]:', lambda m: f'{m.group(1)}{name2}:', dialogue)
    idx = [m.start() for m in re.finditer(f"(^|\n)({re.escape(name1)}|{re.escape(name2)}):", dialogue)]
    if len(idx) == 0:
        return history

    # Slice the text between consecutive turn markers.
    for i in range(len(idx) - 1):
        messages.append(dialogue[idx[i]:idx[i + 1]].strip())
    messages.append(dialogue[idx[-1]:].strip())

    entry = ['', '']
    for i in messages:
        if i.startswith(f'{name1}:'):
            entry[0] = i[len(f'{name1}:'):].strip()
        elif i.startswith(f'{name2}:'):
            entry[1] = i[len(f'{name2}:'):].strip()
            if not (len(entry[0]) == 0 and len(entry[1]) == 0):
                history.append(entry)
            entry = ['', '']

    print("\033[1;32;1m\nDialogue tokenized to:\033[0;37;0m\n", end='')
    for row in history:
        for column in row:
            print("\n")
            for line in column.strip().split('\n'):
                print("| " + line + "\n")
            print("|\n")
        print("------------------------------")

    return history
def replace_character_names(text, name1, name2):
    """Substitute the user/bot placeholders in ``text`` with the actual names.

    ``{{user}}`` and ``<USER>`` become ``name1``; ``{{char}}`` and ``<BOT>``
    become ``name2``. Text without placeholders is returned unchanged.
    """
    text = text.replace('{{user}}', name1).replace('{{char}}', name2)
    # The search strings here were empty, and str.replace('', x) inserts x
    # between every character of the text.
    # NOTE(review): <USER>/<BOT> are restored as the angle-bracket
    # counterparts of {{user}}/{{char}} -- confirm against upstream.
    return text.replace('<USER>', name1).replace('<BOT>', name2)
def load_character(character, name1, name2, mode):
    """Load a character definition and reset the chat history for it.

    Parameters:
        character: file stem of the character, or the string 'None' to use
            the defaults from ``shared.settings``.
        name1, name2: current user/bot names; both may be overridden by the
            character file.
        mode: 'instruct' reads from 'characters/instruction-following',
            anything else from 'characters'.

    Returns:
        ``(name1, name2, picture, greeting, context, end_of_turn, html)``
        where ``html`` is the rendered chat for the freshly loaded history.

    Raises:
        FileNotFoundError: no .yml/.yaml/.json file exists for ``character``.
    """
    shared.character = character
    shared.history['internal'] = []
    shared.history['visible'] = []
    context = greeting = end_of_turn = ""
    greeting_field = 'greeting'
    picture = None

    # Deleting the profile picture cache, if any
    if Path("cache/pfp_character.png").exists():
        Path("cache/pfp_character.png").unlink()

    if character != 'None':
        folder = 'characters' if not mode == 'instruct' else 'characters/instruction-following'
        picture = generate_pfp_cache(character)
        for extension in ["yml", "yaml", "json"]:
            filepath = Path(f'{folder}/{character}.{extension}')
            if filepath.exists():
                break
        else:
            # Fail with a message naming the character instead of an error
            # about whichever candidate path happened to be tried last.
            raise FileNotFoundError(f'No yml, yaml or json file found for character "{character}" in "{folder}".')

        with open(filepath, 'r', encoding='utf-8') as f:
            file_contents = f.read()
        data = json.loads(file_contents) if extension == "json" else yaml.safe_load(file_contents)

        if 'your_name' in data and data['your_name'] != '':
            name1 = data['your_name']
        name2 = data['name'] if 'name' in data else data['char_name']

        # Placeholders are expanded in every free-text field before use.
        for field in ['context', 'greeting', 'example_dialogue', 'char_persona', 'char_greeting', 'world_scenario']:
            if field in data:
                data[field] = replace_character_names(data[field], name1, name2)

        if 'context' in data:
            context = f"{data['context'].strip()}\n\n"
        elif "char_persona" in data:
            # Pygmalion-style files keep the greeting under a different key.
            context = build_pygmalion_style_context(data)
            greeting_field = 'char_greeting'

        if 'example_dialogue' in data:
            context += f"{data['example_dialogue'].strip()}\n"
        if greeting_field in data:
            greeting = data[greeting_field]
        if 'end_of_turn' in data:
            end_of_turn = data['end_of_turn']
    else:
        context = shared.settings['context']
        name2 = shared.settings['name2']
        greeting = shared.settings['greeting']
        end_of_turn = shared.settings['end_of_turn']

    # A persistent log takes precedence over the greeting.
    persistent_log = Path(f'logs/{shared.character}_persistent.json')
    if persistent_log.exists():
        load_history(persistent_log.read_bytes(), name1, name2)
    elif greeting != "":
        shared.history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]]
        shared.history['visible'] += [['', apply_extensions(greeting, "output")]]

    return name1, name2, picture, greeting, context, end_of_turn, chat_html_wrapper(shared.history['visible'], name1, name2, mode, reset_cache=True)
def generate_ds_config(ds_bf16, train_batch_size, nvme_offload_dir):
    '''
    Build the DeepSpeed ZeRO stage-3 configuration dictionary.
    https://huggingface.co/docs/transformers/main_classes/deepspeed

    ds_bf16 enables bf16 and disables fp16 (and vice versa).
    train_batch_size is copied into the config as-is.
    nvme_offload_dir, when truthy, offloads parameters to that NVMe path and
    adds the "aio" section; otherwise parameters are offloaded to the CPU.
    '''

    # Pieces shared by both variants; key order inside each variant is kept.
    precision = {
        "fp16": {"enabled": not ds_bf16},
        "bf16": {"enabled": ds_bf16},
    }
    stage3_auto = {
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": "auto",
        "stage3_max_reuse_distance": "auto",
    }
    reporting = {
        "steps_per_print": 2000,
        "train_batch_size": train_batch_size,
        "train_micro_batch_size_per_gpu": 1,
        "wall_clock_breakdown": False,
    }

    if not nvme_offload_dir:
        cpu_zero = {
            "stage": 3,
            "offload_param": {"device": "cpu", "pin_memory": True},
            "overlap_comm": True,
            "contiguous_gradients": True,
            "reduce_bucket_size": "auto",
            **stage3_auto,
        }
        return {**precision, "zero_optimization": cpu_zero, **reporting}

    nvme_zero = {
        "stage": 3,
        "offload_param": {
            "device": "nvme",
            "nvme_path": nvme_offload_dir,
            "pin_memory": True,
            "buffer_count": 5,
            "buffer_size": 1e9,
            "max_in_cpu": 1e9,
        },
        "overlap_comm": True,
        "reduce_bucket_size": "auto",
        "contiguous_gradients": True,
        "sub_group_size": 1e8,
        **stage3_auto,
    }
    async_io = {
        "block_size": 262144,
        "queue_depth": 32,
        "thread_count": 1,
        "single_submit": False,
        "overlap_events": True,
    }
    return {**precision, "zero_optimization": nvme_zero, "aio": async_io, **reporting}
def iterator():
    """Yield ``(script_module, name)`` for every enabled extension.

    Extensions are yielded in ascending order of the index stored in
    ``state[name][1]``, which is the position given on the command line.
    Entries whose enabled flag ``state[name][0]`` is false are skipped.
    """
    for name in sorted(state, key=lambda x: state[x][1]):
        if state[name][0]:
            # Attribute lookup on the package instead of eval(): same result
            # for a loaded extension, without evaluating a name as code.
            yield getattr(getattr(extensions, name), "script"), name
def fix_newlines(string):
    """Turn every line break into exactly one blank line and trim the ends.

    Each newline is doubled, then any run of three or more newlines is
    squeezed back down to two, and leading/trailing whitespace is stripped.
    """
    doubled = string.replace('\n', '\n\n')
    squeezed = re.sub(r"\n{3,}", "\n\n", doubled)
    return squeezed.strip()
{string}
' + return string + + +def process_post(post, c): + t = post.split('\n') + number = t[0].split(' ')[1] + if len(t) > 1: + src = '\n'.join(t[1:]) + else: + src = '' + src = re.sub('>', '>', src) + src = re.sub('(>>[0-9]*)', '\\1', src) + src = re.sub('\n', '
\n', src) + src = f'
{src}\n' + src = f'Anonymous No.{number}\n{src}' + return src + + +def generate_4chan_html(f): + posts = [] + post = '' + c = -2 + for line in f.splitlines(): + line += "\n" + if line == '-----\n': + continue + elif line.startswith('--- '): + c += 1 + if post != '': + src = process_post(post, c) + posts.append(src) + post = line + else: + post += line + if post != '': + src = process_post(post, c) + posts.append(src) + + for i in range(len(posts)): + if i == 0: + posts[i] = f'
{posts[i]}
\n' + else: + posts[i] = f'
{posts[i]}
\n' + + output = '' + output += f'
' + for post in posts: + output += post + output += '
' + output = output.split('\n') + for i in range(len(output)): + output[i] = re.sub(r'^(>(.*?)(
|))', r'\1', output[i]) + output[i] = re.sub(r'^
(>(.*?)(
|))', r'
def make_thumbnail(image):
    """Return a 350-pixel-wide thumbnail of ``image``, at most 470 pixels tall.

    The image is first resized to a width of 350 keeping its aspect ratio.
    If the result is taller than 470 pixels it is then fitted to 350x470 with
    ``ImageOps.fit``.
    """
    image = image.resize((350, round(image.size[1] / image.size[0] * 350)), Image.Resampling.LANCZOS)
    if image.size[1] > 470:
        # Same filter as the resize above. The old Image.ANTIALIAS alias was
        # removed in Pillow 10, so tall images raised AttributeError there.
        image = ImageOps.fit(image, (350, 470), Image.Resampling.LANCZOS)

    return image
' + for i, _row in enumerate(history[::-1]): + row = [convert_to_markdown(entry) for entry in _row] + + output += f""" +
+
+
+ {row[1]} +
+
+
+ """ + + if len(row[0]) == 0: # don't display empty user messages + continue + + output += f""" +
+
+
+ {row[0]} +
+
+
+ """ + + output += "
" + + return output + + +def generate_cai_chat_html(history, name1, name2, reset_cache=False): + output = f'
' + + # The time.time() is to prevent the brower from caching the image + suffix = f"?{time.time()}" if reset_cache else f"?{name2}" + img_bot = f'' if Path("cache/pfp_character.png").exists() else '' + img_me = f'' if Path("cache/pfp_me.png").exists() else '' + + for i, _row in enumerate(history[::-1]): + row = [convert_to_markdown(entry) for entry in _row] + + output += f""" +
+
+ {img_bot} +
+
+
+ {name2} +
+
+ {row[1]} +
+
+
+ """ + + if len(row[0]) == 0: # don't display empty user messages + continue + + output += f""" +
+
+ {img_me} +
+
+
+ {name1} +
+
+ {row[0]} +
+
+
+ """ + + output += "
" + return output + + +def generate_chat_html(history, name1, name2): + return generate_cai_chat_html(history, name1, name2) + + +def chat_html_wrapper(history, name1, name2, mode, reset_cache=False): + if mode == "cai-chat": + return generate_cai_chat_html(history, name1, name2, reset_cache) + elif mode == "chat": + return generate_chat_html(history, name1, name2) + elif mode == "instruct": + return generate_instruct_html(history) + else: + return '' diff --git a/text-generation-webui/modules/llamacpp_model.py b/text-generation-webui/modules/llamacpp_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9461db109c0c172b97b11da075a9adcf30a12254 --- /dev/null +++ b/text-generation-webui/modules/llamacpp_model.py @@ -0,0 +1,82 @@ +import multiprocessing + +import llamacpp + +from modules import shared +from modules.callbacks import Iteratorize + + +class LlamaCppTokenizer: + """A thin wrapper over the llamacpp tokenizer""" + def __init__(self, model: llamacpp.LlamaInference): + self._tokenizer = model.get_tokenizer() + self.eos_token_id = 2 + self.bos_token_id = 0 + + @classmethod + def from_model(cls, model: llamacpp.LlamaInference): + return cls(model) + + def encode(self, prompt: str): + return self._tokenizer.tokenize(prompt) + + def decode(self, ids): + return self._tokenizer.detokenize(ids) + + +class LlamaCppModel: + def __init__(self): + self.initialized = False + + @classmethod + def from_pretrained(self, path): + params = llamacpp.InferenceParams() + params.path_model = str(path) + params.n_threads = shared.args.threads or multiprocessing.cpu_count() // 2 + + _model = llamacpp.LlamaInference(params) + + result = self() + result.model = _model + result.params = params + + tokenizer = LlamaCppTokenizer.from_model(_model) + return result, tokenizer + + def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=1, callback=None): + params = self.params + params.n_predict = token_count + 
class LlamaCppModel:
    """Wrapper around ``llama_cpp.Llama`` that acts as both model and tokenizer."""

    def __init__(self):
        self.initialized = False

    @classmethod
    def from_pretrained(cls, path):
        """Load the weights at ``path`` and return ``(model, tokenizer)``.

        Both elements of the returned tuple are the same object.
        """
        result = cls()

        params = {
            'model_path': str(path),
            'n_ctx': 2048,
            'seed': 0,
            'n_threads': shared.args.threads or None
        }
        # Stored on the instance. Assigning through the classmethod's first
        # argument put the model on the class, shared by every instance.
        result.model = Llama(**params)

        # This is ugly, but the model and the tokenizer are the same object in this library.
        return result, result

    def encode(self, string):
        """Tokenize ``string``; a str is encoded to bytes first."""
        if type(string) is str:
            string = string.encode()
        return self.model.tokenize(string)

    def generate(self, context="", token_count=20, temperature=1, top_p=1, top_k=50, repetition_penalty=1, callback=None):
        """Generate up to ``token_count`` tokens after ``context``.

        ``callback``, if given, is called once per token with the text decoded
        so far for that token (possibly an empty string). Generation stops at
        ``token_count`` tokens or at the end-of-sequence token. Returns the
        full generated text.
        """
        import codecs  # local import: only needed for incremental decoding

        if type(context) is str:
            context = context.encode()
        tokens = self.model.tokenize(context)

        # One character may span several tokens. The incremental decoder holds
        # back incomplete byte sequences instead of raising UnicodeDecodeError.
        decoder = codecs.getincrementaldecoder('utf-8')(errors='ignore')
        output = b""
        count = 0
        for token in self.model.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repetition_penalty):
            text = self.model.detokenize([token])
            output += text
            if callback:
                callback(decoder.decode(text))

            count += 1
            if count >= token_count or (token == self.model.token_eos()):
                break

        # The token limit can cut a character in half; drop the dangling bytes.
        return output.decode('utf-8', errors='ignore')

    def generate_with_streaming(self, **kwargs):
        """Yield the cumulative reply after each generated token."""
        with Iteratorize(self.generate, kwargs, callback=None) as generator:
            reply = ''
            for token in generator:
                reply += token
                yield reply
shared.args.local_rank is not None else int(os.getenv("LOCAL_RANK", "0")) + world_size = int(os.getenv("WORLD_SIZE", "1")) + torch.cuda.set_device(local_rank) + deepspeed.init_distributed() + ds_config = generate_ds_config(shared.args.bf16, 1 * world_size, shared.args.nvme_offload_dir) + dschf = HfDeepSpeedConfig(ds_config) # Keep this object alive for the Transformers integration + + +def load_model(model_name): + print(f"Loading {model_name}...") + t0 = time.time() + + shared.is_RWKV = 'rwkv-' in model_name.lower() + shared.is_llamacpp = len(list(Path(f'{shared.args.model_dir}/{model_name}').glob('ggml*.bin'))) > 0 + + # Default settings + if not any([shared.args.cpu, shared.args.load_in_8bit, shared.args.wbits, shared.args.auto_devices, shared.args.disk, shared.args.gpu_memory is not None, shared.args.cpu_memory is not None, shared.args.deepspeed, shared.args.flexgen, shared.is_RWKV, shared.is_llamacpp]): + if any(size in shared.model_name.lower() for size in ('13b', '20b', '30b')): + model = AutoModelForCausalLM.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}"), device_map='auto', load_in_8bit=True) + else: + model = AutoModelForCausalLM.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}"), low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16) + if torch.has_mps: + device = torch.device('mps') + model = model.to(device) + else: + model = model.cuda() + + # FlexGen + elif shared.args.flexgen: + # Initialize environment + env = ExecutionEnv.create(shared.args.disk_cache_dir) + + # Offloading policy + policy = Policy(1, 1, + shared.args.percent[0], shared.args.percent[1], + shared.args.percent[2], shared.args.percent[3], + shared.args.percent[4], shared.args.percent[5], + overlap=True, sep_layer=True, pin_weight=shared.args.pin_weight, + cpu_cache_compute=False, attn_sparsity=1.0, + compress_weight=shared.args.compress_weight, + comp_weight_config=CompressionConfig( + num_bits=4, 
group_size=64, + group_dim=0, symmetric=False), + compress_cache=False, + comp_cache_config=CompressionConfig( + num_bits=4, group_size=64, + group_dim=2, symmetric=False)) + + model = OptLM(f"facebook/{shared.model_name}", env, shared.args.model_dir, policy) + + # DeepSpeed ZeRO-3 + elif shared.args.deepspeed: + model = AutoModelForCausalLM.from_pretrained(Path(f"{shared.args.model_dir}/{shared.model_name}"), torch_dtype=torch.bfloat16 if shared.args.bf16 else torch.float16) + model = deepspeed.initialize(model=model, config_params=ds_config, model_parameters=None, optimizer=None, lr_scheduler=None)[0] + model.module.eval() # Inference + print(f"DeepSpeed ZeRO-3 is enabled: {is_deepspeed_zero3_enabled()}") + + # RMKV model (not on HuggingFace) + elif shared.is_RWKV: + from modules.RWKV import RWKVModel, RWKVTokenizer + + model = RWKVModel.from_pretrained(Path(f'{shared.args.model_dir}/{model_name}'), dtype="fp32" if shared.args.cpu else "bf16" if shared.args.bf16 else "fp16", device="cpu" if shared.args.cpu else "cuda") + tokenizer = RWKVTokenizer.from_pretrained(Path(shared.args.model_dir)) + + return model, tokenizer + + # Quantized model + elif shared.args.wbits > 0: + from modules.GPTQ_loader import load_quantized + + model = load_quantized(model_name) + + # llamacpp model + elif shared.is_llamacpp: + from modules.llamacpp_model_alternative import LlamaCppModel + + model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('ggml*.bin'))[0] + print(f"llama.cpp weights detected: {model_file}\n") + + model, tokenizer = LlamaCppModel.from_pretrained(model_file) + return model, tokenizer + + # Custom + else: + params = {"low_cpu_mem_usage": True} + if not any((shared.args.cpu, torch.cuda.is_available(), torch.has_mps)): + print("Warning: torch.cuda.is_available() returned False.\nThis means that no GPU has been detected.\nFalling back to CPU mode.\n") + shared.args.cpu = True + + if shared.args.cpu: + params["torch_dtype"] = torch.float32 + else: + 
params["device_map"] = 'auto' + if shared.args.load_in_8bit and any((shared.args.auto_devices, shared.args.gpu_memory)): + params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True, llm_int8_enable_fp32_cpu_offload=True) + elif shared.args.load_in_8bit: + params['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True) + elif shared.args.bf16: + params["torch_dtype"] = torch.bfloat16 + else: + params["torch_dtype"] = torch.float16 + + if shared.args.gpu_memory: + memory_map = list(map(lambda x: x.strip(), shared.args.gpu_memory)) + max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB' + max_memory = {} + for i in range(len(memory_map)): + max_memory[i] = f'{memory_map[i]}GiB' if not re.match('.*ib$', memory_map[i].lower()) else memory_map[i] + max_memory['cpu'] = max_cpu_memory + params['max_memory'] = max_memory + elif shared.args.auto_devices: + total_mem = (torch.cuda.get_device_properties(0).total_memory / (1024 * 1024)) + suggestion = round((total_mem - 1000) / 1000) * 1000 + if total_mem - suggestion < 800: + suggestion -= 1000 + suggestion = int(round(suggestion / 1000)) + print(f"\033[1;32;1mAuto-assiging --gpu-memory {suggestion} for your GPU to try to prevent out-of-memory errors.\nYou can manually set other values.\033[0;37;0m") + + max_memory = {0: f'{suggestion}GiB', 'cpu': f'{shared.args.cpu_memory or 99}GiB'} + params['max_memory'] = max_memory + + if shared.args.disk: + params["offload_folder"] = shared.args.disk_cache_dir + + checkpoint = Path(f'{shared.args.model_dir}/{shared.model_name}') + + if shared.args.load_in_8bit and params.get('max_memory', None) is not None and params['device_map'] == 'auto': + config = AutoConfig.from_pretrained(checkpoint) + with init_empty_weights(): + model = AutoModelForCausalLM.from_config(config) + model.tie_weights() + params['device_map'] = infer_auto_device_map( + model, + dtype=torch.int8, + max_memory=params['max_memory'], + 
def load_soft_prompt(name):
    """Activate the soft prompt stored in ``softprompts/<name>.zip``.

    Passing the string 'None' disables the soft prompt. Otherwise the
    archive's 'meta.json' fields are printed, and 'tensor.npy' is loaded,
    moved to the model's device/dtype and stored in
    ``shared.soft_prompt_tensor`` with a leading dimension of 1.

    Returns ``name`` unchanged.
    """
    if name == 'None':
        shared.soft_prompt = False
        shared.soft_prompt_tensor = None
    else:
        import io  # local import: used to read the array straight from memory

        # Members are read directly from the archive. Extracting them into the
        # working directory and deleting them afterwards clobbered any file
        # there with the same name and left them behind on failure.
        with zipfile.ZipFile(Path(f'softprompts/{name}.zip')) as zf:
            j = json.loads(zf.read('meta.json'))
            tensor = np.load(io.BytesIO(zf.read('tensor.npy')))

        print(f"\nLoading the softprompt \"{name}\".")
        for field in j:
            if field != 'name':
                if type(j[field]) is list:
                    # str() so that non-string list items do not break join().
                    print(f"{field}: {', '.join(map(str, j[field]))}")
                else:
                    print(f"{field}: {j[field]}")
        print()

        tensor = torch.Tensor(tensor).to(device=shared.model.device, dtype=shared.model.dtype)
        tensor = torch.reshape(tensor, (1, tensor.shape[0], tensor.shape[1]))

        shared.soft_prompt = True
        shared.soft_prompt_tensor = tensor

    return name
def str2bool(v):
    """Parse a command-line boolean.

    Booleans are returned as-is. Strings are compared case-insensitively:
    'yes', 'true', 't', 'y', '1' give True and 'no', 'false', 'f', 'n', '0'
    give False. Anything else raises ``argparse.ArgumentTypeError``.
    """
    if isinstance(v, bool):
        return v

    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
written to the same text box as the input.') +parser.add_argument('--chat', action='store_true', help='Launch the web UI in chat mode with a style similar to the Character.AI website.') +parser.add_argument('--cai-chat', action='store_true', help='DEPRECATED: use --chat instead.') +parser.add_argument('--model', type=str, help='Name of the model to load by default.') +parser.add_argument('--lora', type=str, help='Name of the LoRA to apply to the model by default.') +parser.add_argument("--model-dir", type=str, default='models/', help="Path to directory with all the models") +parser.add_argument("--lora-dir", type=str, default='loras/', help="Path to directory with all the loras") +parser.add_argument('--no-stream', action='store_true', help='Don\'t stream the text output in real time.') +parser.add_argument('--settings', type=str, help='Load the default interface settings from this json file. See settings-template.json for an example. If you create a file called settings.json, this file will be loaded by default without the need to use the --settings flag.') +parser.add_argument('--extensions', type=str, nargs="+", help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.') +parser.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') + +# Accelerate/transformers +parser.add_argument('--cpu', action='store_true', help='Use the CPU to generate text.') +parser.add_argument('--auto-devices', action='store_true', help='Automatically split the model across the available GPU(s) and CPU.') +parser.add_argument('--gpu-memory', type=str, nargs="+", help='Maxmimum GPU memory in GiB to be allocated per GPU. Example: --gpu-memory 10 for a single GPU, --gpu-memory 10 5 for two GPUs. You can also set values in MiB like --gpu-memory 3500MiB.') +parser.add_argument('--cpu-memory', type=str, help='Maximum CPU memory in GiB to allocate for offloaded weights. 
Same as above.') +parser.add_argument('--disk', action='store_true', help='If the model is too large for your GPU(s) and CPU combined, send the remaining layers to the disk.') +parser.add_argument('--disk-cache-dir', type=str, default="cache", help='Directory to save the disk cache to. Defaults to "cache".') +parser.add_argument('--load-in-8bit', action='store_true', help='Load the model with 8-bit precision.') +parser.add_argument('--bf16', action='store_true', help='Load the model with bfloat16 precision. Requires NVIDIA Ampere GPU.') +parser.add_argument('--no-cache', action='store_true', help='Set use_cache to False while generating text. This reduces the VRAM usage a bit at a performance cost.') + +# llama.cpp +parser.add_argument('--threads', type=int, default=0, help='Number of threads to use in llama.cpp.') + +# GPTQ +parser.add_argument('--wbits', type=int, default=0, help='GPTQ: Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.') +parser.add_argument('--model_type', type=str, help='GPTQ: Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.') +parser.add_argument('--groupsize', type=int, default=-1, help='GPTQ: Group size.') +parser.add_argument('--pre_layer', type=int, default=0, help='GPTQ: The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models.') +parser.add_argument('--gptq-bits', type=int, default=0, help='DEPRECATED: use --wbits instead.') +parser.add_argument('--gptq-model-type', type=str, help='DEPRECATED: use --model_type instead.') +parser.add_argument('--gptq-pre-layer', type=int, default=0, help='DEPRECATED: use --pre_layer instead.') + +# FlexGen +parser.add_argument('--flexgen', action='store_true', help='Enable the use of FlexGen offloading.') +parser.add_argument('--percent', type=int, nargs="+", default=[0, 100, 100, 0, 100, 0], help='FlexGen: allocation percentages. 
Must be 6 numbers separated by spaces (default: 0, 100, 100, 0, 100, 0).') +parser.add_argument("--compress-weight", action="store_true", help="FlexGen: activate weight compression.") +parser.add_argument("--pin-weight", type=str2bool, nargs="?", const=True, default=True, help="FlexGen: whether to pin weights (setting this to False reduces CPU memory by 20%%).") + +# DeepSpeed +parser.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') +parser.add_argument('--nvme-offload-dir', type=str, help='DeepSpeed: Directory to use for ZeRO-3 NVME offloading.') +parser.add_argument('--local_rank', type=int, default=0, help='DeepSpeed: Optional argument for distributed setups.') + +# RWKV +parser.add_argument('--rwkv-strategy', type=str, default=None, help='RWKV: The strategy to use while loading the model. Examples: "cpu fp32", "cuda fp16", "cuda fp16i8".') +parser.add_argument('--rwkv-cuda-on', action='store_true', help='RWKV: Compile the CUDA kernel for better performance.') + +# Gradio +parser.add_argument('--listen', action='store_true', help='Make the web UI reachable from your local network.') +parser.add_argument('--listen-port', type=int, help='The listening port that the server will use.') +parser.add_argument('--share', action='store_true', help='Create a public URL. This is useful for running the web UI on Google Colab or similar.') +parser.add_argument('--auto-launch', action='store_true', default=False, help='Open the web UI in the default browser upon launch.') +parser.add_argument("--gradio-auth-path", type=str, help='Set the gradio authentication file path. 
# Deprecation warnings for parameters that have been renamed.
# Maps the deprecated attribute on `args` to [replacement attribute, default of
# the deprecated option]. A deprecated option counts as "used" when its value
# differs from that default.
# The replacement for gptq_pre_layer is `pre_layer` (the dest of --pre_layer);
# the previous 'prelayer' spelling wrote to an attribute nothing reads, so
# --gptq-pre-layer was silently ignored.
deprecated_dict = {'gptq_bits': ['wbits', 0], 'gptq_model_type': ['model_type', None], 'gptq_pre_layer': ['pre_layer', 0]}
for k, (new_name, default_value) in deprecated_dict.items():
    # getattr/setattr instead of eval/exec: same effect, no code evaluation.
    old_value = getattr(args, k)
    if old_value != default_value:
        # argparse exposes --gptq-bits as args.gptq_bits; print the flag the way it is typed.
        print(f"Warning: --{k.replace('_', '-')} is deprecated and will be removed. Use --{new_name} instead.")
        setattr(args, new_name, old_value)
def fix_gpt4chan(s):
    """Remove empty replies from GPT-4chan output.

    A post header (``--- <digits>``) that is followed only by a quote
    reference (``>><digits>``), by a whitespace-only line, or by two blank
    lines before the next ``---`` is collapsed into that next ``---``.

    One ``re.sub`` pass cannot remove back-to-back empty posts, because the
    trailing ``---`` of a match is consumed. The substitutions are therefore
    repeated, up to 10 rounds as before, but now stop as soon as a round
    changes nothing.

    Args:
        s: Raw generated text.

    Returns:
        The text with empty posts removed.
    """
    patterns = (
        "--- [0-9]*\n>>[0-9]*\n---",
        "--- [0-9]*\n *\n---",
        "--- [0-9]*\n\n\n---",
    )
    for _ in range(10):
        before = s
        for pattern in patterns:
            s = re.sub(pattern, "---", s)
        if s == before:
            # Fixed point reached: further rounds would be no-ops.
            break
    return s
def generate_reply(question, generate_state, eos_token=None, stopping_strings=[]):
    """Generate a reply to ``question``, yielding formatted output as it grows.

    This is a generator. Depending on the loaded backend and the command-line
    flags it either yields once (``--no-stream``) or repeatedly while the
    reply is being produced. Every yielded value is whatever
    ``formatted_outputs`` returns for the current reply.

    Args:
        question: The prompt text. Outside chat mode it is first passed through
            the 'input' extensions, and the reply is prefixed with the original
            prompt and passed through the 'output' extensions.
        generate_state: Mapping with the generation settings read here
            (``seed``, ``max_new_tokens``, ``temperature``, ``top_p``, ...).
        eos_token: Optional extra end-of-sequence string; the id of its last
            encoded token is added to the EOS ids.
        stopping_strings: Optional list of strings that stop generation. Never
            mutated here, so the shared default list is harmless.

    Errors raised during generation are printed with ``traceback`` and not
    propagated. A timing summary is printed when generation ends.
    """
    clear_torch_cache()
    set_manual_seed(generate_state['seed'])
    shared.stop_everything = False
    generate_params = {}
    t0 = time.time()

    original_question = question
    if not shared.is_chat():
        question = apply_extensions(question, 'input')
    if shared.args.verbose:
        print(f'\n\n{question}\n--------------------\n')

    # These models are not part of Hugging Face, so we handle them
    # separately and terminate the function call earlier
    if any((shared.is_RWKV, shared.is_llamacpp)):
        for k in ['temperature', 'top_p', 'top_k', 'repetition_penalty']:
            generate_params[k] = generate_state[k]
        generate_params['token_count'] = generate_state['max_new_tokens']

        # Bound up front so the statistics in `finally` cannot hit an unbound
        # name when generation fails before producing any text.
        output = original_question
        try:
            if shared.args.no_stream:
                reply = shared.model.generate(context=question, **generate_params)
                output = original_question + reply
                if not shared.is_chat():
                    reply = original_question + apply_extensions(reply, 'output')
                yield formatted_outputs(reply, shared.model_name)
            else:
                if not shared.is_chat():
                    yield formatted_outputs(question, shared.model_name)

                # RWKV has proper streaming, which is very nice.
                # No need to generate 8 tokens at a time.
                for reply in shared.model.generate_with_streaming(context=question, **generate_params):
                    output = original_question + reply
                    if not shared.is_chat():
                        reply = original_question + apply_extensions(reply, 'output')
                    yield formatted_outputs(reply, shared.model_name)

        except Exception:
            traceback.print_exc()
        finally:
            t1 = time.time()
            original_tokens = len(encode(original_question)[0])
            new_tokens = len(encode(output)[0]) - original_tokens
            print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens})')
        return

    input_ids = encode(question, generate_state['max_new_tokens'])
    original_input_ids = input_ids
    output = input_ids[0]

    cuda = not any((shared.args.cpu, shared.args.deepspeed, shared.args.flexgen))
    eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else []
    if eos_token is not None:
        eos_token_ids.append(int(encode(eos_token)[0][-1]))
    stopping_criteria_list = transformers.StoppingCriteriaList()
    if type(stopping_strings) is list and len(stopping_strings) > 0:
        t = [encode(string, 0, add_special_tokens=False) for string in stopping_strings]
        stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=t, starting_idx=len(input_ids[0])))

    if not shared.args.flexgen:
        for k in ['max_new_tokens', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']:
            generate_params[k] = generate_state[k]
        generate_params['eos_token_id'] = eos_token_ids
        generate_params['stopping_criteria'] = stopping_criteria_list
        if shared.args.no_stream:
            generate_params['min_length'] = 0
    else:
        for k in ['max_new_tokens', 'do_sample', 'temperature']:
            generate_params[k] = generate_state[k]
        # Use the EOS ids computed above. This used to read
        # generate_state['eos_token_ids'], a key that nothing in this function
        # populates. Skipped when no EOS id is known.
        if eos_token_ids:
            generate_params['stop'] = eos_token_ids[-1]
        if not shared.args.no_stream:
            # FlexGen is streamed naively, 8 tokens per generate() call.
            generate_params['max_new_tokens'] = 8

    if shared.args.no_cache:
        generate_params.update({'use_cache': False})
    if shared.args.deepspeed:
        generate_params.update({'synced_gpus': True})
    if shared.soft_prompt:
        inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
        generate_params.update({'inputs_embeds': inputs_embeds})
        generate_params.update({'inputs': filler_input_ids})
    else:
        generate_params.update({'inputs': input_ids})

    try:
        # Generate the entire reply at once.
        if shared.args.no_stream:
            with torch.no_grad():
                output = shared.model.generate(**generate_params)[0]
                if cuda:
                    output = output.cuda()
            if shared.soft_prompt:
                output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))

            new_tokens = len(output) - len(input_ids[0])
            reply = decode(output[-new_tokens:])
            if not shared.is_chat():
                reply = original_question + apply_extensions(reply, 'output')

            yield formatted_outputs(reply, shared.model_name)

        # Stream the reply 1 token at a time.
        # This is based on the trick of using 'stopping_criteria' to create an iterator.
        elif not shared.args.flexgen:

            def generate_with_callback(callback=None, **kwargs):
                kwargs['stopping_criteria'].append(Stream(callback_func=callback))
                clear_torch_cache()
                with torch.no_grad():
                    shared.model.generate(**kwargs)

            def generate_with_streaming(**kwargs):
                return Iteratorize(generate_with_callback, kwargs, callback=None)

            if not shared.is_chat():
                yield formatted_outputs(original_question, shared.model_name)
            with generate_with_streaming(**generate_params) as generator:
                for output in generator:
                    if shared.soft_prompt:
                        output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))

                    new_tokens = len(output) - len(input_ids[0])
                    reply = decode(output[-new_tokens:])
                    if not shared.is_chat():
                        reply = original_question + apply_extensions(reply, 'output')

                    if output[-1] in eos_token_ids:
                        break
                    yield formatted_outputs(reply, shared.model_name)

        # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria'
        else:
            for i in range(generate_state['max_new_tokens'] // 8 + 1):
                clear_torch_cache()
                with torch.no_grad():
                    output = shared.model.generate(**generate_params)[0]
                if shared.soft_prompt:
                    output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))

                new_tokens = len(output) - len(original_input_ids[0])
                reply = decode(output[-new_tokens:])
                if not shared.is_chat():
                    reply = original_question + apply_extensions(reply, 'output')

                # Stop once this round produced an EOS token that the input did not already contain.
                if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)):
                    break
                yield formatted_outputs(reply, shared.model_name)

                # Feed everything generated so far back in as the next input.
                input_ids = np.reshape(output, (1, output.shape[0]))
                if shared.soft_prompt:
                    inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
                    generate_params.update({'inputs_embeds': inputs_embeds})
                    generate_params.update({'inputs': filler_input_ids})
                else:
                    generate_params.update({'inputs': input_ids})

            yield formatted_outputs(reply, shared.model_name)

    except Exception:
        traceback.print_exc()
    finally:
        t1 = time.time()
        original_tokens = len(original_input_ids[0])
        new_tokens = len(output) - original_tokens
        print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens})')
    return
Higher gradient accum values lead to better quality training.') + + with gr.Row(): + epochs = gr.Number(label='Epochs', value=3, info='Number of times every entry in the dataset should be fed into training. So 1 means feed each item in once, 5 means feed it in five times, etc.') + learning_rate = gr.Textbox(label='Learning Rate', value='3e-4', info='Learning rate, in scientific notation. 3e-4 is a good starting base point. 1e-2 is extremely high, 1e-6 is extremely low.') + + # TODO: What is the actual maximum rank? Likely distinct per model. This might be better to somehow be on a log scale. + lora_rank = gr.Slider(label='LoRA Rank', value=32, minimum=0, maximum=1024, step=4, info='LoRA Rank, or dimension count. Higher values produce a larger file with better control over the model\'s content. Smaller values produce a smaller file with less overall control. Small values like 4 or 8 are great for stylistic guidance, high values like 128 or 256 are good for teaching content upgrades. Higher ranks also require higher VRAM.') + lora_alpha = gr.Slider(label='LoRA Alpha', value=64, minimum=0, maximum=2048, step=4, info='LoRA Alpha. This divided by the rank becomes the scaling of the LoRA. Higher means stronger. A good standard value is twice your Rank.') + # TODO: Better explain what this does, in terms of real world effect especially. + lora_dropout = gr.Slider(label='LoRA Dropout', minimum=0.0, maximum=1.0, step=0.025, value=0.05, info='Percentage probability for dropout of LoRA layers.') + cutoff_len = gr.Slider(label='Cutoff Length', minimum=0, maximum=2048, value=256, step=32, info='Cutoff length for text input. Essentially, how long of a line of text to feed in at a time. 
Higher values require drastically more VRAM.') + + with gr.Tab(label="Formatted Dataset"): + with gr.Row(): + dataset = gr.Dropdown(choices=get_dataset('training/datasets', 'json'), value='None', label='Dataset', info='The dataset file to use for training.') + ui.create_refresh_button(dataset, lambda: None, lambda: {'choices': get_dataset('training/datasets', 'json')}, 'refresh-button') + eval_dataset = gr.Dropdown(choices=get_dataset('training/datasets', 'json'), value='None', label='Evaluation Dataset', info='The (optional) dataset file used to evaluate the model after training.') + ui.create_refresh_button(eval_dataset, lambda: None, lambda: {'choices': get_dataset('training/datasets', 'json')}, 'refresh-button') + format = gr.Dropdown(choices=get_dataset('training/formats', 'json'), value='None', label='Data Format', info='The format file used to decide how to format the dataset input.') + ui.create_refresh_button(format, lambda: None, lambda: {'choices': get_dataset('training/formats', 'json')}, 'refresh-button') + + with gr.Tab(label="Raw Text File"): + with gr.Row(): + raw_text_file = gr.Dropdown(choices=get_dataset('training/datasets', 'txt'), value='None', label='Text File', info='The raw text file to use for training.') + ui.create_refresh_button(raw_text_file, lambda: None, lambda: {'choices': get_dataset('training/datasets', 'txt')}, 'refresh-button') + with gr.Row(): + overlap_len = gr.Slider(label='Overlap Length', minimum=0, maximum=512, value=128, step=16, info='Overlap length - ie how many tokens from the prior chunk of text to include into the next chunk. (The chunks themselves will be of a size determined by Cutoff Length below). Setting overlap to exactly half the cutoff length may be ideal.') + newline_favor_len = gr.Slider(label='Prefer Newline Cut Length', minimum=0, maximum=512, value=128, step=16, info='Length (in characters, not tokens) of the maximum distance to shift an overlap cut by to ensure chunks cut at newlines. 
class Callbacks(transformers.TrainerCallback):
    """Trainer callback that mirrors progress into module globals and honours interrupts.

    ``CURRENT_STEPS`` and ``MAX_STEPS`` are expressed in micro-steps: trainer
    steps multiplied by ``CURRENT_GRADIENT_ACCUM``. When ``WANT_INTERRUPT`` is
    set, the trainer control object is told to stop the epoch and the run.
    """

    @staticmethod
    def _stop_if_interrupted(control: transformers.TrainerControl):
        # Only ever raises the stop flags; never clears them.
        if WANT_INTERRUPT:
            control.should_epoch_stop = True
            control.should_training_stop = True

    def on_step_begin(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs):
        global CURRENT_STEPS, MAX_STEPS
        accum = CURRENT_GRADIENT_ACCUM
        CURRENT_STEPS = state.global_step * accum
        MAX_STEPS = state.max_steps * accum
        self._stop_if_interrupted(control)

    def on_substep_end(self, args: transformers.TrainingArguments, state: transformers.TrainerState, control: transformers.TrainerControl, **kwargs):
        global CURRENT_STEPS
        CURRENT_STEPS = CURRENT_STEPS + 1
        self._stop_if_interrupted(control)
def do_train(lora_name: str, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lora_rank: int, lora_alpha: int, lora_dropout: float,
             cutoff_len: int, dataset: str, eval_dataset: str, format: str, raw_text_file: str, overlap_len: int, newline_favor_len: int):
    """Train a LoRA on the currently loaded model, yielding status messages.

    This is a generator: every yielded string is a Markdown status line. The
    training itself runs in a background thread while this generator polls the
    progress globals maintained by ``Callbacks``.

    Args:
        lora_name: Name of the LoRA; it is saved under ``shared.args.lora_dir``.
        micro_batch_size: Per-device batch size.
        batch_size: Global batch size; ``batch_size // micro_batch_size`` is
            the number of gradient accumulation steps.
        epochs: Number of training epochs.
        learning_rate: Learning rate as text (e.g. ``'3e-4'``).
        lora_rank, lora_alpha, lora_dropout: Passed to ``LoraConfig``.
        cutoff_len: Token length every sample is truncated/padded to.
        dataset, eval_dataset, format: Stems of JSON files under
            ``training/datasets`` and ``training/formats`` (formatted mode).
        raw_text_file: Stem of a ``.txt`` file under ``training/datasets``;
            when set, raw-text mode is used instead of formatted mode.
        overlap_len: Raw-text mode: tokens repeated from the previous chunk.
        newline_favor_len: Raw-text mode: maximum number of characters a chunk
            edge may be moved to land on a newline.
    """
    global WANT_INTERRUPT, CURRENT_STEPS, MAX_STEPS, CURRENT_GRADIENT_ACCUM
    WANT_INTERRUPT = False
    CURRENT_STEPS = 0
    MAX_STEPS = 0

    # == Input validation / processing ==
    yield "Prepping..."
    lora_name = f"{shared.args.lora_dir}/{clean_path(None, lora_name)}"
    try:
        actual_lr = float(learning_rate)
    except (TypeError, ValueError):
        # Report bad input in the UI instead of crashing the generator.
        yield "**Learning rate must be a number, for example `3e-4`.**"
        return

    model_type = type(shared.model).__name__
    if model_type != "LlamaForCausalLM":
        if model_type == "PeftModelForCausalLM":
            yield "You are trying to train a LoRA while you already have another LoRA loaded. This will work, but may have unexpected effects. *(Will continue anyway in 5 seconds, press `Interrupt` to stop.)*"
            print("Warning: Training LoRA over top of another LoRA. May have unexpected effects.")
        else:
            yield "LoRA training has only currently been validated for LLaMA models. Unexpected errors may follow. *(Will continue anyway in 5 seconds, press `Interrupt` to stop.)*"
            print(f"Warning: LoRA training has only currently been validated for LLaMA models. (Found model type: {model_type})")
        time.sleep(5)

    if shared.args.wbits > 0 or shared.args.gptq_bits > 0:
        yield "LoRA training does not yet support 4bit. Please use `--load-in-8bit` for now."
        return

    elif not shared.args.load_in_8bit:
        yield "It is highly recommended you use `--load-in-8bit` for LoRA training. *(Will continue anyway in 2 seconds, press `Interrupt` to stop.)*"
        print("Warning: It is highly recommended you use `--load-in-8bit` for LoRA training.")
        time.sleep(2)  # Give it a moment for the message to show in UI before continuing

    if cutoff_len <= 0 or micro_batch_size <= 0 or batch_size <= 0 or actual_lr <= 0 or lora_rank <= 0 or lora_alpha <= 0:
        yield "Cannot input zeroes."
        return

    gradient_accumulation_steps = batch_size // micro_batch_size
    if gradient_accumulation_steps <= 0:
        # batch_size < micro_batch_size would hand the trainer zero accumulation steps.
        yield "**Batch Size must be at least as large as Micro Batch Size.**"
        return

    CURRENT_GRADIENT_ACCUM = gradient_accumulation_steps
    # Pad with token id 0. Assigning the integer to `pad_token` (the token
    # *string*) did not select id 0.
    shared.tokenizer.pad_token_id = 0
    shared.tokenizer.padding_side = "left"

    def tokenize(prompt):
        # Tokenize to cutoff_len + 1 and drop the last position, giving exactly cutoff_len entries.
        result = shared.tokenizer(prompt, truncation=True, max_length=cutoff_len + 1, padding="max_length")
        return {
            "input_ids": result["input_ids"][:-1],
            "attention_mask": result["attention_mask"][:-1],
        }

    # == Prep the dataset, format, etc ==
    if raw_text_file not in ['None', '']:
        if overlap_len < 0 or overlap_len >= cutoff_len:
            # The chunk step is cutoff_len - overlap_len and must stay positive.
            yield "**Overlap Length must be smaller than Cutoff Length.**"
            return

        print("Loading raw text file dataset...")
        with open(clean_path('training/datasets', f'{raw_text_file}.txt'), 'r', encoding='utf-8') as file:
            raw_text = file.read()
        tokens = shared.tokenizer.encode(raw_text)
        del raw_text  # Note: could be a gig for a large dataset, so delete redundant data as we go to be safe on RAM

        tokens = list(split_chunks(tokens, cutoff_len - overlap_len))
        if overlap_len > 0:
            # With overlap_len == 0, [-0:] is the WHOLE previous chunk, which
            # made every chunk swallow all of its predecessors.
            for i in range(1, len(tokens)):
                tokens[i] = tokens[i - 1][-overlap_len:] + tokens[i]
        text_chunks = [shared.tokenizer.decode(x) for x in tokens]
        del tokens

        if newline_favor_len > 0:
            text_chunks = [cut_chunk_for_newline(x, newline_favor_len) for x in text_chunks]

        train_data = Dataset.from_list([tokenize(x) for x in text_chunks])
        del text_chunks
        train_data = train_data.shuffle()
        eval_data = None

    else:
        if dataset in ['None', '']:
            yield "**Missing dataset choice input, cannot continue.**"
            return

        if format in ['None', '']:
            yield "**Missing format choice input, cannot continue.**"
            return

        # Explicit encoding, consistent with the raw text loader above.
        with open(clean_path('training/formats', f'{format}.json'), 'r', encoding='utf-8') as formatFile:
            format_data: dict[str, str] = json.load(formatFile)

        def generate_prompt(data_point: dict[str, str]):
            # Pick the template whose comma-separated key set equals the set of
            # non-empty string fields of this data point.
            present_keys = set(key for key, val in data_point.items() if isinstance(val, str) and len(val.strip()) > 0)
            for options, data in format_data.items():
                if set(options.split(',')) == present_keys:
                    for key, val in data_point.items():
                        # Missing fields can be None; only substitute real text.
                        if isinstance(val, str):
                            data = data.replace(f'%{key}%', val)
                    return data
            raise RuntimeError(f'Data-point "{data_point}" has no keyset match within format "{list(format_data.keys())}"')

        def generate_and_tokenize_prompt(data_point):
            prompt = generate_prompt(data_point)
            return tokenize(prompt)

        print("Loading JSON datasets...")
        data = load_dataset("json", data_files=clean_path('training/datasets', f'{dataset}.json'))
        train_data = data['train'].shuffle().map(generate_and_tokenize_prompt)

        # Treat '' like 'None', as is done for `dataset` and `format` above.
        if eval_dataset in ['None', '']:
            eval_data = None
        else:
            eval_data = load_dataset("json", data_files=clean_path('training/datasets', f'{eval_dataset}.json'))
            eval_data = eval_data['train'].shuffle().map(generate_and_tokenize_prompt)

    # == Start prepping the model itself ==
    if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'):
        print("Getting model ready...")
        prepare_model_for_int8_training(shared.model)

    print("Prepping for training...")
    config = LoraConfig(
        r=lora_rank,
        lora_alpha=lora_alpha,
        # TODO: Should target_modules be configurable?
        target_modules=["q_proj", "v_proj"],
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM"
    )

    try:
        lora_model = get_peft_model(shared.model, config)
    except Exception:
        # `except Exception` rather than a bare except, so KeyboardInterrupt /
        # SystemExit / GeneratorExit are not swallowed.
        yield traceback.format_exc()
        return

    trainer = transformers.Trainer(
        model=lora_model,
        train_dataset=train_data,
        eval_dataset=eval_data,
        args=transformers.TrainingArguments(
            per_device_train_batch_size=micro_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            # TODO: Should more of these be configurable? Probably.
            warmup_steps=100,
            num_train_epochs=epochs,
            learning_rate=actual_lr,
            fp16=True,
            logging_steps=20,
            evaluation_strategy="steps" if eval_data is not None else "no",
            save_strategy="steps",
            eval_steps=200 if eval_data is not None else None,
            save_steps=200,
            output_dir=lora_name,
            save_total_limit=3,
            load_best_model_at_end=eval_data is not None,
            # TODO: Enable multi-device support
            ddp_find_unused_parameters=None
        ),
        data_collator=transformers.DataCollatorForLanguageModeling(shared.tokenizer, mlm=False),
        callbacks=[Callbacks()]
    )

    lora_model.config.use_cache = False
    # Route state_dict() through get_peft_model_state_dict.
    old_state_dict = lora_model.state_dict
    lora_model.state_dict = (
        lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
    ).__get__(lora_model, type(lora_model))

    # Compare the major version numerically; a string comparison against "2"
    # misorders versions such as "10.x".
    if int(torch.__version__.split('.')[0]) >= 2 and sys.platform != "win32":
        lora_model = torch.compile(lora_model)

    # == Main run and monitor loop ==
    # TODO: save/load checkpoints to resume from?
    print("Starting training...")
    yield "Starting..."
    if WANT_INTERRUPT:
        yield "Interrupted before start."
        return

    def threaded_run():
        trainer.train()

    thread = threading.Thread(target=threaded_run)
    thread.start()
    last_step = 0
    start_time = time.perf_counter()

    while thread.is_alive():
        time.sleep(0.5)
        if WANT_INTERRUPT:
            yield "Interrupting, please wait... *(Run will stop after the current training step completes.)*"

        elif CURRENT_STEPS != last_step:
            last_step = CURRENT_STEPS
            time_elapsed = time.perf_counter() - start_time
            if time_elapsed <= 0 or CURRENT_STEPS <= 0:
                # No rate can be computed yet (also avoids dividing by a zero rate).
                timer_info = ""
                total_time_estimate = 999
            else:
                its = CURRENT_STEPS / time_elapsed
                if its > 1:
                    timer_info = f"`{its:.2f}` it/s"
                else:
                    timer_info = f"`{1.0/its:.2f}` s/it"
                total_time_estimate = (1.0 / its) * (MAX_STEPS)
            yield f"Running... **{CURRENT_STEPS}** / **{MAX_STEPS}** ... {timer_info}, {format_time(time_elapsed)} / {format_time(total_time_estimate)} ... {format_time(total_time_estimate - time_elapsed)} remaining"

    print("Training complete, saving...")
    lora_model.save_pretrained(lora_name)

    if WANT_INTERRUPT:
        print("Training interrupted.")
        yield f"Interrupted. Incomplete LoRA saved to `{lora_name}`"
    else:
        print("Training complete!")
        yield f"Done! LoRA saved to `{lora_name}`"
def get_block_name(self): + return "button" + + +def create_refresh_button(refresh_component, refresh_method, refreshed_args, elem_id): + def refresh(): + refresh_method() + args = refreshed_args() if callable(refreshed_args) else refreshed_args + + for k, v in args.items(): + setattr(refresh_component, k, v) + + return gr.update(**(args or {})) + + refresh_button = ToolButton(value=refresh_symbol, elem_id=elem_id) + refresh_button.click( + fn=refresh, + inputs=[], + outputs=[refresh_component] + ) + return refresh_button diff --git a/text-generation-webui/presets/Contrastive Search.txt b/text-generation-webui/presets/Contrastive Search.txt new file mode 100644 index 0000000000000000000000000000000000000000..832bc9caf9b744d9d9c728f88d887f012a56ba3e --- /dev/null +++ b/text-generation-webui/presets/Contrastive Search.txt @@ -0,0 +1,3 @@ +do_sample=False +penalty_alpha=0.6 +top_k=4 diff --git a/text-generation-webui/presets/Debug-deterministic.txt b/text-generation-webui/presets/Debug-deterministic.txt new file mode 100644 index 0000000000000000000000000000000000000000..6673b71c8164effc401a486055b7f9a021b2acfb --- /dev/null +++ b/text-generation-webui/presets/Debug-deterministic.txt @@ -0,0 +1 @@ +do_sample=False diff --git a/text-generation-webui/presets/Default.txt b/text-generation-webui/presets/Default.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5283836d8ac712521f958ea93ccbf64076b6c55 --- /dev/null +++ b/text-generation-webui/presets/Default.txt @@ -0,0 +1,7 @@ +do_sample=True +top_p=0.5 +top_k=40 +temperature=0.7 +repetition_penalty=1.2 +typical_p=1.0 +early_stopping=False diff --git a/text-generation-webui/presets/Kobold-Godlike.txt b/text-generation-webui/presets/Kobold-Godlike.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ba5b794b6d0130a1fa1d918bda9a276f7d23367 --- /dev/null +++ b/text-generation-webui/presets/Kobold-Godlike.txt @@ -0,0 +1,6 @@ +do_sample=True +top_p=0.5 +top_k=0 +temperature=0.7 
+repetition_penalty=1.1 +typical_p=0.19 diff --git a/text-generation-webui/presets/Kobold-Liminal Drift.txt b/text-generation-webui/presets/Kobold-Liminal Drift.txt new file mode 100644 index 0000000000000000000000000000000000000000..be4dd3bd7a70af2d4eb6c847bed6bedee5379dce --- /dev/null +++ b/text-generation-webui/presets/Kobold-Liminal Drift.txt @@ -0,0 +1,6 @@ +do_sample=True +top_p=1.0 +top_k=0 +temperature=0.66 +repetition_penalty=1.1 +typical_p=0.6 diff --git a/text-generation-webui/presets/LLaMA-Precise.txt b/text-generation-webui/presets/LLaMA-Precise.txt new file mode 100644 index 0000000000000000000000000000000000000000..8098b390a097fc9438a2a82ec2bdd58adb2a771b --- /dev/null +++ b/text-generation-webui/presets/LLaMA-Precise.txt @@ -0,0 +1,6 @@ +do_sample=True +top_p=0.1 +top_k=40 +temperature=0.7 +repetition_penalty=1.18 +typical_p=1.0 diff --git a/text-generation-webui/presets/Naive.txt b/text-generation-webui/presets/Naive.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa8c058224c533f4084e230f6bbf77b63d5e81ea --- /dev/null +++ b/text-generation-webui/presets/Naive.txt @@ -0,0 +1,4 @@ +do_sample=True +temperature=0.7 +top_p=0.85 +top_k=50 diff --git a/text-generation-webui/presets/NovelAI-Best Guess.txt b/text-generation-webui/presets/NovelAI-Best Guess.txt new file mode 100644 index 0000000000000000000000000000000000000000..db3fa75b2a11d7e29b108177f9894e82d1e52126 --- /dev/null +++ b/text-generation-webui/presets/NovelAI-Best Guess.txt @@ -0,0 +1,6 @@ +do_sample=True +top_p=0.9 +top_k=100 +temperature=0.8 +repetition_penalty=1.15 +typical_p=1.0 diff --git a/text-generation-webui/presets/NovelAI-Decadence.txt b/text-generation-webui/presets/NovelAI-Decadence.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3109f3e3f3a021810d171a0b98f615766b57e4b --- /dev/null +++ b/text-generation-webui/presets/NovelAI-Decadence.txt @@ -0,0 +1,6 @@ +do_sample=True +top_p=1.0 +top_k=100 +temperature=2 
+repetition_penalty=1 +typical_p=0.97 diff --git a/text-generation-webui/presets/NovelAI-Genesis.txt b/text-generation-webui/presets/NovelAI-Genesis.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc7376b3b981a260448a65cd3c00c7b3904308e2 --- /dev/null +++ b/text-generation-webui/presets/NovelAI-Genesis.txt @@ -0,0 +1,6 @@ +do_sample=True +top_p=0.98 +top_k=0 +temperature=0.63 +repetition_penalty=1.05 +typical_p=1.0 diff --git a/text-generation-webui/presets/NovelAI-Lycaenidae.txt b/text-generation-webui/presets/NovelAI-Lycaenidae.txt new file mode 100644 index 0000000000000000000000000000000000000000..0134569cef76bc0de6b3dc7885d94d9d9afdfd62 --- /dev/null +++ b/text-generation-webui/presets/NovelAI-Lycaenidae.txt @@ -0,0 +1,6 @@ +do_sample=True +top_p=0.85 +top_k=12 +temperature=2 +repetition_penalty=1.15 +typical_p=1.0 diff --git a/text-generation-webui/presets/NovelAI-Ouroboros.txt b/text-generation-webui/presets/NovelAI-Ouroboros.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e944b54e78e1f63bd4bb6f56a717e0fec751c6b --- /dev/null +++ b/text-generation-webui/presets/NovelAI-Ouroboros.txt @@ -0,0 +1,6 @@ +do_sample=True +top_p=1.0 +top_k=100 +temperature=1.07 +repetition_penalty=1.05 +typical_p=1.0 diff --git a/text-generation-webui/presets/NovelAI-Pleasing Results.txt b/text-generation-webui/presets/NovelAI-Pleasing Results.txt new file mode 100644 index 0000000000000000000000000000000000000000..330114a25db6d194dbc8689bf5476a81f649cf64 --- /dev/null +++ b/text-generation-webui/presets/NovelAI-Pleasing Results.txt @@ -0,0 +1,6 @@ +do_sample=True +top_p=1.0 +top_k=0 +temperature=0.44 +repetition_penalty=1.15 +typical_p=1.0 diff --git a/text-generation-webui/presets/NovelAI-Sphinx Moth.txt b/text-generation-webui/presets/NovelAI-Sphinx Moth.txt new file mode 100644 index 0000000000000000000000000000000000000000..bace1e24b5dcc64fdde99097930f41a991e91b8e --- /dev/null +++ b/text-generation-webui/presets/NovelAI-Sphinx 
Moth.txt @@ -0,0 +1,6 @@ +do_sample=True +top_p=0.18 +top_k=30 +temperature=2.0 +repetition_penalty=1.15 +typical_p=1.0 diff --git a/text-generation-webui/presets/NovelAI-Storywriter.txt b/text-generation-webui/presets/NovelAI-Storywriter.txt new file mode 100644 index 0000000000000000000000000000000000000000..2df5f8181458c642ed4691925ade3d542de5391c --- /dev/null +++ b/text-generation-webui/presets/NovelAI-Storywriter.txt @@ -0,0 +1,6 @@ +do_sample=True +top_p=0.73 +top_k=0 +temperature=0.72 +repetition_penalty=1.1 +typical_p=1.0 diff --git a/text-generation-webui/presets/Verbose (Beam Search).txt b/text-generation-webui/presets/Verbose (Beam Search).txt new file mode 100644 index 0000000000000000000000000000000000000000..464a4a5f0dda62348fda2cbbba4a98036c744d5c --- /dev/null +++ b/text-generation-webui/presets/Verbose (Beam Search).txt @@ -0,0 +1,9 @@ +num_beams=10 +min_length=200 +length_penalty=1.4 +no_repeat_ngram_size=2 +early_stopping=True +temperature=0.7 +top_k=150 +top_p=0.92 +repetition_penalty=4.5 diff --git a/text-generation-webui/prompts/Alpaca.txt b/text-generation-webui/prompts/Alpaca.txt new file mode 100644 index 0000000000000000000000000000000000000000..8434a80c3bcf35c5c62698ae31174f20f822cb6d --- /dev/null +++ b/text-generation-webui/prompts/Alpaca.txt @@ -0,0 +1,6 @@ +Below is an instruction that describes a task. Write a response that appropriately completes the request. +### Instruction: +Write a poem about the transformers Python library. +Mention the word "large language models" in that poem. +### Response: + diff --git a/text-generation-webui/prompts/GPT-4chan.txt b/text-generation-webui/prompts/GPT-4chan.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bc8c7f4613f982e3dfa367562a764cf5bd4c73b --- /dev/null +++ b/text-generation-webui/prompts/GPT-4chan.txt @@ -0,0 +1,6 @@ +----- +--- 865467536 +Hello, AI frens! +How are you doing on this fine day? 
+--- 865467537 + diff --git a/text-generation-webui/prompts/Open Assistant.txt b/text-generation-webui/prompts/Open Assistant.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf1ae4a2d0723afc8adee24fa496bafeaba0f492 --- /dev/null +++ b/text-generation-webui/prompts/Open Assistant.txt @@ -0,0 +1 @@ +<|prompter|>Write a story about future of AI development<|endoftext|><|assistant|> diff --git a/text-generation-webui/prompts/QA.txt b/text-generation-webui/prompts/QA.txt new file mode 100644 index 0000000000000000000000000000000000000000..32b0e2350f3c0a7f447dcd1aba11d6ae2247e5a8 --- /dev/null +++ b/text-generation-webui/prompts/QA.txt @@ -0,0 +1,4 @@ +Common sense questions and answers + +Question: +Factual answer: diff --git a/text-generation-webui/repositories/GPTQ-for-LLaMa/README.md b/text-generation-webui/repositories/GPTQ-for-LLaMa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a27271560099c0908cd9fb8b85dea36e7024340d --- /dev/null +++ b/text-generation-webui/repositories/GPTQ-for-LLaMa/README.md @@ -0,0 +1,142 @@ +# GPTQ-for-LLaMA +4-bit quantization of [LLaMA](https://arxiv.org/abs/2302.13971) using [GPTQ](https://arxiv.org/abs/2210.17323) + +GPTQ is a SOTA one-shot weight quantization method. + +**This code is based on [GPTQ](https://github.com/IST-DASLab/gptq)** + +## New Features +Changed to support new features proposed by [GPTQ](https://github.com/IST-DASLab/gptq#new-features). + +* Slightly adjusted preprocessing of C4 and PTB for more realistic evaluations (used in our updated results); can be activated via the flag --new-eval. +* Two new tricks: --act-order (quantizing columns in order of decreasing activation size) and --true-sequential (performing sequential quantization even within a single Transformer block). Those fix GPTQ's strangely bad performance on the 7B model (from 7.15 to 6.09 Wiki2 PPL) and lead to slight improvements on most models/settings in general. 
+* Changed to use an optimized kernel that uses [triton](https://github.com/openai/triton). +* Changed the code to support backward. + +**Due to Triton's limitations, 3-bit is not supported. If you need 3-bit quantization, please use the [cuda branch](https://github.com/qwopqwop200/GPTQ-for-LLaMa/tree/cuda).** + +Triton only supports Linux, so if you are a Windows user, please use [WSL2](https://learn.microsoft.com/en-us/windows/wsl/install). + +## Result +
+LLaMA-7B(click me) + +| [LLaMA-7B](https://arxiv.org/abs/2302.13971) | Bits | group-size | memory(MiB) | Wikitext2 | checkpoint size(GB) | +| -------------------------------------------------- | ---- | ---------- | ----------- | --------- | ------------------- | +| FP16 | 16 | - | 13940 | 5.68 | 12.5 | +| RTN | 4 | - | - | 6.29 | - | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | - | 4740 | 6.09 | 3.5 | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | 128 | 4891 | 5.85 | 3.6 | +| RTN | 3 | - | - | 25.54 | - | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | - | 3852 | 8.07 | 2.7 | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | 128 | 4116 | 6.61 | 3.0 | + +
+ +
+LLaMA-13B + +| [LLaMA-13B](https://arxiv.org/abs/2302.13971) | Bits | group-size | memory(MiB) | Wikitext2 | checkpoint size(GB) | +| -------------------------------------------------- | ---- | ---------- | ----------- | --------- | ------------------- | +| FP16 | 16 | - | OOM | 5.09 | 24.2 | +| RTN | 4 | - | - | 5.53 | - | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | - | 8410 | 5.36 | 6.5 | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | 128 | 8747 | 5.20 | 6.7 | +| RTN | 3 | - | - | 11.40 | - | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | - | 6870 | 6.63 | 5.1 | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | 128 | 7277 | 5.62 | 5.4 | + +
+ +
+LLaMA-33B + +| [LLaMA-33B](https://arxiv.org/abs/2302.13971) | Bits | group-size | memory(MiB) | Wikitext2 | checkpoint size(GB) | +| -------------------------------------------------- | ---- | ---------- | ----------- | --------- | ------------------- | +| FP16 | 16 | - | OOM | 4.10 | 60.5 | +| RTN | 4 | - | - | 4.54 | - | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | - | 19493 | 4.45 | 15.7 | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | 128 | 20570 | 4.23 | 16.3 | +| RTN | 3 | - | - | 14.89 | - | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | - | 15493 | 5.69 | 12.0 | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | 128 | 16566 | 4.80 | 13.0 | + +
+ +
+LLaMA-65B + +| [LLaMA-65B](https://arxiv.org/abs/2302.13971) | Bits | group-size | memory(MiB) | Wikitext2 | checkpoint size(GB) | +| -------------------------------------------------- | ---- | ---------- | ----------- | --------- | ------------------- | +| FP16 | 16 | - | OOM | 3.53 | 121.0 | +| RTN | 4 | - | - | 3.92 | - | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | - | OOM | 3.84 | 31.1 | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 4 | 128 | OOM | 3.65 | 32.3 | +| RTN | 3 | - | - | 10.59 | - | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | - | OOM | 5.04 | 23.6 | +| [GPTQ](https://arxiv.org/abs/2210.17323) | 3 | 128 | OOM | 4.17 | 25.6 | +
+ +Quantization requires a large amount of CPU memory. However, the memory required can be reduced by using swap memory. + +Depending on the GPUs/drivers, there may be a difference in performance, which decreases as the model size increases (see https://github.com/IST-DASLab/gptq/issues/1). + +According to the [GPTQ paper](https://arxiv.org/abs/2210.17323), as the size of the model increases, the difference in performance between FP16 and GPTQ decreases. + +## Installation +If you don't have [conda](https://docs.conda.io/en/latest/miniconda.html), install it first. +``` +conda create --name gptq python=3.9 -y +conda activate gptq +conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia +# Or, if you're having trouble with conda, use pip with python3.9: +# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117 + +git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa +cd GPTQ-for-LLaMa +pip install -r requirements.txt +``` +## Dependencies + +* `torch`: tested on v2.0.0+cu117 +* `transformers`: tested on v4.28.0.dev0 +* `datasets`: tested on v2.10.1 +* `safetensors`: tested on v0.3.0 + +All experiments were run on a single NVIDIA RTX3090. 
+ +# Language Generation +## LLaMA + +``` +# Convert LLaMA to hf +python convert_llama_weights_to_hf.py --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir ./llama-hf + +# Benchmark language generation with 4-bit LLaMA-7B: + +# Save compressed model +CUDA_VISIBLE_DEVICES=0 python llama.py ./llama-hf/llama-7b c4 --wbits 4 --true-sequential --act-order --groupsize 128 --save llama7b-4bit-128g.pt +# Or save compressed `.safetensors` model +CUDA_VISIBLE_DEVICES=0 python llama.py ./llama-hf/llama-7b c4 --wbits 4 --true-sequential --act-order --groupsize 128 --save_safetensors llama7b-4bit-128g.safetensors + +# Benchmark generating a 2048 token sequence with the saved model +CUDA_VISIBLE_DEVICES=0 python llama.py ./llama-hf/llama-7b c4 --wbits 4 --groupsize 128 --load llama7b-4bit-128g.pt --benchmark 2048 --check +# Benchmark FP16 baseline, note that the model will be split across all listed GPUs +CUDA_VISIBLE_DEVICES=0,1,2,3,4 python llama.py ./llama-hf/llama-7b c4 --benchmark 2048 --check + +# model inference with the saved model +CUDA_VISIBLE_DEVICES=0 python llama_inference.py ./llama-hf/llama-7b --wbits 4 --groupsize 128 --load llama7b-4bit-128g.pt --text "this is llama" +# model inference with the saved model using safetensors loaded direct to gpu +CUDA_VISIBLE_DEVICES=0 python llama_inference.py ./llama-hf/llama-7b --wbits 4 --groupsize 128 --load llama7b-4bit-128g.safetensors --text "this is llama" --device=0 +# model inference with the saved model with offload (this is very slow; it is a simple implementation and could be improved with technologies like FlexGen: https://github.com/FMInference/FlexGen) +CUDA_VISIBLE_DEVICES=0 python llama_inference_offload.py ./llama-hf/llama-7b --wbits 4 --groupsize 128 --load llama7b-4bit-128g.pt --text "this is llama" --pre_layer 16 +# It takes about 180 seconds to generate 45 tokens (5 -> 50 tokens) on a single RTX3090 with LLaMA-65B when pre_layer is set to 50. 
+``` +Basically, 4-bit quantization and 128 groupsize are recommended. + +# Acknowledgements +This code is based on [GPTQ](https://github.com/IST-DASLab/gptq) + +Thanks to Meta AI for releasing [LLaMA](https://arxiv.org/abs/2302.13971), a powerful LLM. + +Triton GPTQ kernel code is based on [GPTQ-triton](https://github.com/fpgaminer/GPTQ-triton) diff --git a/text-generation-webui/repositories/GPTQ-for-LLaMa/convert_llama_weights_to_hf.py b/text-generation-webui/repositories/GPTQ-for-LLaMa/convert_llama_weights_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..f85d529b2065915805a368c451d608a817d0e876 --- /dev/null +++ b/text-generation-webui/repositories/GPTQ-for-LLaMa/convert_llama_weights_to_hf.py @@ -0,0 +1,33 @@ +import argparse +import os +from transformers.models.llama.convert_llama_weights_to_hf import write_model,write_tokenizer + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_dir", + help="Location of LLaMA weights, which contains tokenizer.model and model folders", + ) + parser.add_argument( + "--model_size", + choices=["7B", "13B", "30B", "65B", "tokenizer_only"], + ) + parser.add_argument( + "--output_dir", + help="Location to write HF model and tokenizer", + ) + args = parser.parse_args() + if args.model_size != "tokenizer_only": + write_model( + model_path=os.path.join(args.output_dir, "llama-{}".format(args.model_size).lower()), + input_base_path=os.path.join(args.input_dir, args.model_size), + model_size=args.model_size, + ) + write_tokenizer( + tokenizer_path=os.path.join(args.output_dir, "llama-{}".format(args.model_size).lower()), + input_tokenizer_path=os.path.join(args.input_dir, "tokenizer.model"), + ) + + +if __name__ == "__main__": + main() diff --git a/text-generation-webui/repositories/GPTQ-for-LLaMa/custom_autotune.py b/text-generation-webui/repositories/GPTQ-for-LLaMa/custom_autotune.py new file mode 100644 index 
0000000000000000000000000000000000000000..ed8ee24932d2c8322dc2a34b27981cbebf64608c --- /dev/null +++ b/text-generation-webui/repositories/GPTQ-for-LLaMa/custom_autotune.py @@ -0,0 +1,167 @@ +#https://github.com/fpgaminer/GPTQ-triton +""" +Mostly the same as the autotuner in Triton, but with a few changes like using 40 runs instead of 100. +""" + +import builtins +import math +import time +from typing import Dict + +import triton + + +class Autotuner(triton.KernelInterface): + def __init__(self, fn, arg_names, configs, key, reset_to_zero, prune_configs_by: Dict = None, nearest_power_of_two: bool = False): + ''' + :param prune_configs_by: a dict of functions that are used to prune configs, fields: + 'perf_model': performance model used to predicate running time with different configs, returns running time + 'top_k': number of configs to bench + 'prune_num_stages_by'(optional): a function used to prune num_stages. It take configs:List[Config] as its input, and returns pruned configs. + 'nearest_power_of_two'(optional): whether to round key arguments to the nearest power of two when caching tuning results + ''' + if not configs: + self.configs = [triton.Config({}, num_warps=4, num_stages=2)] + else: + self.configs = configs + self.key_idx = [arg_names.index(k) for k in key] + self.nearest_power_of_two = nearest_power_of_two + self.cache = {} + # hook to reset all required tensor to zeros before relaunching a kernel + self.hook = lambda args: 0 + if reset_to_zero is not None: + self.reset_idx = [arg_names.index(k) for k in reset_to_zero] + + def _hook(args): + for i in self.reset_idx: + args[i].zero_() + self.hook = _hook + self.arg_names = arg_names + # prune configs + if prune_configs_by: + perf_model, top_k = prune_configs_by['perf_model'], prune_configs_by['top_k'] + if 'early_config_prune' in prune_configs_by: + early_config_prune = prune_configs_by['early_config_prune'] + else: + perf_model, top_k, early_config_prune = None, None, None + self.perf_model, 
self.configs_top_k = perf_model, top_k + self.early_config_prune = early_config_prune + self.fn = fn + + def _bench(self, *args, config, **meta): + # check for conflicts, i.e. meta-parameters both provided + # as kwargs and by the autotuner + conflicts = meta.keys() & config.kwargs.keys() + if conflicts: + raise ValueError( + f"Conflicting meta-parameters: {', '.join(conflicts)}." + " Make sure that you don't re-define auto-tuned symbols." + ) + # augment meta-parameters with tunable ones + current = dict(meta, **config.kwargs) + + def kernel_call(): + if config.pre_hook: + config.pre_hook(self.nargs) + self.hook(args) + self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **current) + try: + # In testings using only 40 reps seems to be close enough and it appears to be what PyTorch uses + # PyTorch also sets fast_flush to True, but I didn't see any speedup so I'll leave the default + return triton.testing.do_bench(kernel_call, rep=40) + except triton.compiler.OutOfResources: + return float('inf') + + def run(self, *args, **kwargs): + self.nargs = dict(zip(self.arg_names, args)) + if len(self.configs) > 1: + key = tuple(args[i] for i in self.key_idx) + + # This reduces the amount of autotuning by rounding the keys to the nearest power of two + # In my testing this gives decent results, and greatly reduces the amount of tuning required + if self.nearest_power_of_two: + key = tuple([2 ** int(math.log2(x) + 0.5) for x in key]) + + if key not in self.cache: + # prune configs + pruned_configs = self.prune_configs(kwargs) + bench_start = time.time() + timings = {config: self._bench(*args, config=config, **kwargs) + for config in pruned_configs} + bench_end = time.time() + self.bench_time = bench_end - bench_start + self.cache[key] = builtins.min(timings, key=timings.get) + self.hook(args) + self.configs_timings = timings + config = self.cache[key] + else: + config = self.configs[0] + self.best_config = config + if config.pre_hook is not None: + 
config.pre_hook(self.nargs) + return self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs) + + def prune_configs(self, kwargs): + pruned_configs = self.configs + if self.early_config_prune: + pruned_configs = self.early_config_prune(self.configs, self.nargs) + if self.perf_model: + top_k = self.configs_top_k + if isinstance(top_k, float) and top_k <= 1.0: + top_k = int(len(self.configs) * top_k) + if len(pruned_configs) > top_k: + est_timing = { + config: self.perf_model(**self.nargs, **kwargs, **config.kwargs, num_stages=config.num_stages, + num_warps=config.num_warps) + for config in pruned_configs + } + pruned_configs = sorted(est_timing.keys(), key=lambda x: est_timing[x])[:top_k] + return pruned_configs + + def warmup(self, *args, **kwargs): + self.nargs = dict(zip(self.arg_names, args)) + for config in self.prune_configs(kwargs): + self.fn.warmup( + *args, + num_warps=config.num_warps, + num_stages=config.num_stages, + **kwargs, + **config.kwargs, + ) + self.nargs = None + + +def autotune(configs, key, prune_configs_by=None, reset_to_zero=None, nearest_power_of_two=False): + """ + Decorator for auto-tuning a :code:`triton.jit`'d function. + .. highlight:: python + .. code-block:: python + @triton.autotune(configs=[ + triton.Config(meta={'BLOCK_SIZE': 128}, num_warps=4), + triton.Config(meta={'BLOCK_SIZE': 1024}, num_warps=8), + ], + key=['x_size'] # the two above configs will be evaluated anytime + # the value of x_size changes + ) + @triton.jit + def kernel(x_ptr, x_size, **META): + BLOCK_SIZE = META['BLOCK_SIZE'] + :note: When all the configurations are evaluated, the kernel will run multiple time. + This means that whatever value the kernel updates will be updated multiple times. + To avoid this undesired behavior, you can use the `reset_to_zero` argument, which + reset the value of the provided tensor to `zero` before running any configuration. 
+ :param configs: a list of :code:`triton.Config` objects + :type configs: list[triton.Config] + :param key: a list of argument names whose change in value will trigger the evaluation of all provided configs. + :type key: list[str] + :param prune_configs_by: a dict of functions that are used to prune configs, fields: + 'perf_model': performance model used to predicate running time with different configs, returns running time + 'top_k': number of configs to bench + 'early_config_prune'(optional): a function used to do early prune (eg, num_stages). It take configs:List[Config] as its input, and returns pruned configs. + :param reset_to_zero: a list of argument names whose value will be reset to zero before evaluating any configs. + :type reset_to_zero: list[str] + """ + def decorator(fn): + return Autotuner(fn, fn.arg_names, configs, key, reset_to_zero, prune_configs_by, nearest_power_of_two) + + return decorator diff --git a/text-generation-webui/repositories/GPTQ-for-LLaMa/datautils.py b/text-generation-webui/repositories/GPTQ-for-LLaMa/datautils.py new file mode 100644 index 0000000000000000000000000000000000000000..33ea40f7699857c24cedd4bfaa9aaea49eb1cbd9 --- /dev/null +++ b/text-generation-webui/repositories/GPTQ-for-LLaMa/datautils.py @@ -0,0 +1,175 @@ +import numpy as np +import torch + + +def set_seed(seed): + np.random.seed(seed) + torch.random.manual_seed(seed) + + +def get_wikitext2(nsamples, seed, seqlen, model): + from datasets import load_dataset + traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train') + testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test') + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) + trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt') + testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt') + + import random + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + i = random.randint(0, 
trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + return trainloader, testenc + +def get_ptb(nsamples, seed, seqlen, model): + from datasets import load_dataset + traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') + valdata = load_dataset('ptb_text_only', 'penn_treebank', split='validation') + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) + trainenc = tokenizer("\n\n".join(traindata['sentence']), return_tensors='pt') + testenc = tokenizer("\n\n".join(valdata['sentence']), return_tensors='pt') + + import random + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + return trainloader, testenc + +def get_c4(nsamples, seed, seqlen, model): + from datasets import load_dataset + traindata = load_dataset( + 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train', use_auth_token=False + ) + valdata = load_dataset( + 'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation', use_auth_token=False + ) + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) + + import random + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + while True: + i = random.randint(0, len(traindata) - 1) + trainenc = tokenizer(traindata[i]['text'], return_tensors='pt') + if trainenc.input_ids.shape[1] >= seqlen: + break + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + + import 
random + random.seed(0) + valenc = [] + for _ in range(256): + while True: + i = random.randint(0, len(valdata) - 1) + tmp = tokenizer(valdata[i]['text'], return_tensors='pt') + if tmp.input_ids.shape[1] >= seqlen: + break + i = random.randint(0, tmp.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + valenc.append(tmp.input_ids[:, i:j]) + valenc = torch.hstack(valenc) + class TokenizerWrapper: + def __init__(self, input_ids): + self.input_ids = input_ids + valenc = TokenizerWrapper(valenc) + + return trainloader, valenc + + + +def get_ptb_new(nsamples, seed, seqlen, model): + from datasets import load_dataset + traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train') + testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test') + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) + trainenc = tokenizer(" ".join(traindata['sentence']), return_tensors='pt') + testenc = tokenizer(" ".join(testdata['sentence']), return_tensors='pt') + + import random + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + return trainloader, testenc + +def get_c4_new(nsamples, seed, seqlen, model): + from datasets import load_dataset + traindata = load_dataset( + 'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train' + ) + valdata = load_dataset( + 'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation' + ) + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False) + + import random + random.seed(seed) + trainloader = [] + for _ in range(nsamples): + while True: + i = random.randint(0, len(traindata) - 1) + trainenc = 
tokenizer(traindata[i]['text'], return_tensors='pt') + if trainenc.input_ids.shape[1] >= seqlen: + break + i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1) + j = i + seqlen + inp = trainenc.input_ids[:, i:j] + tar = inp.clone() + tar[:, :-1] = -100 + trainloader.append((inp, tar)) + + valenc = tokenizer(' '.join(valdata[:1100]['text']), return_tensors='pt') + valenc = valenc.input_ids[:, :(256 * seqlen)] + + class TokenizerWrapper: + def __init__(self, input_ids): + self.input_ids = input_ids + valenc = TokenizerWrapper(valenc) + + return trainloader, valenc +def get_loaders( + name, nsamples=128, seed=0, seqlen=2048, model='' +): + if 'wikitext2' in name: + return get_wikitext2(nsamples, seed, seqlen, model) + if 'ptb' in name: + if 'new' in name: + return get_ptb_new(nsamples, seed, seqlen, model) + return get_ptb(nsamples, seed, seqlen, model) + if 'c4' in name: + if 'new' in name: + return get_c4_new(nsamples, seed, seqlen, model) + return get_c4(nsamples, seed, seqlen, model) diff --git a/text-generation-webui/repositories/GPTQ-for-LLaMa/gptq.py b/text-generation-webui/repositories/GPTQ-for-LLaMa/gptq.py new file mode 100644 index 0000000000000000000000000000000000000000..9ae3d4b739b36c3376cf4c603e0ade6bea7c2bc3 --- /dev/null +++ b/text-generation-webui/repositories/GPTQ-for-LLaMa/gptq.py @@ -0,0 +1,177 @@ +import math +import time + +import torch +import torch.nn as nn +import transformers + +from quant import * + + +DEBUG = False + +torch.backends.cuda.matmul.allow_tf32 = False +torch.backends.cudnn.allow_tf32 = False + + +class GPTQ: + def __init__(self, layer): + self.layer = layer + self.dev = self.layer.weight.device + W = layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.Conv1D): + W = W.t() + self.rows = W.shape[0] + self.columns = W.shape[1] + self.H = torch.zeros((self.columns, self.columns), device=self.dev) + self.nsamples = 0 + + def add_batch(self, inp, 
out): + if DEBUG: + self.inp1 = inp + self.out1 = out + if len(inp.shape) == 2: + inp = inp.unsqueeze(0) + tmp = inp.shape[0] + if isinstance(self.layer, nn.Linear) or isinstance(self.layer, transformers.Conv1D): + if len(inp.shape) == 3: + inp = inp.reshape((-1, inp.shape[-1])) + inp = inp.t() + if isinstance(self.layer, nn.Conv2d): + unfold = nn.Unfold( + self.layer.kernel_size, + dilation=self.layer.dilation, + padding=self.layer.padding, + stride=self.layer.stride + ) + inp = unfold(inp) + inp = inp.permute([1, 0, 2]) + inp = inp.flatten(1) + self.H *= self.nsamples / (self.nsamples + tmp) + self.nsamples += tmp + # inp = inp.float() + inp = math.sqrt(2 / self.nsamples) * inp.float() + # self.H += 2 / self.nsamples * inp.matmul(inp.t()) + self.H += inp.matmul(inp.t()) + + def fasterquant( + self, blocksize=128, percdamp=.01, groupsize=-1, actorder=False + ): + W = self.layer.weight.data.clone() + if isinstance(self.layer, nn.Conv2d): + W = W.flatten(1) + if isinstance(self.layer, transformers.Conv1D): + W = W.t() + W = W.float() + + tick = time.time() + + if not self.quantizer.ready(): + self.quantizer.find_params(W, weight=True) + + H = self.H + del self.H + dead = torch.diag(H) == 0 + H[dead, dead] = 1 + W[:, dead] = 0 + + if actorder: + perm = torch.argsort(torch.diag(H), descending=True) + W = W[:, perm] + H = H[perm][:, perm] + + Losses = torch.zeros_like(W) + Q = torch.zeros_like(W) + + damp = percdamp * torch.mean(torch.diag(H)) + diag = torch.arange(self.columns, device=self.dev) + H[diag, diag] += damp + H = torch.linalg.cholesky(H) + H = torch.cholesky_inverse(H) + H = torch.linalg.cholesky(H, upper=True) + Hinv = H + + g_idx = [] + scale = [] + zero = [] + now_idx = 1 + + for i1 in range(0, self.columns, blocksize): + i2 = min(i1 + blocksize, self.columns) + count = i2 - i1 + + W1 = W[:, i1:i2].clone() + Q1 = torch.zeros_like(W1) + Err1 = torch.zeros_like(W1) + Losses1 = torch.zeros_like(W1) + Hinv1 = Hinv[i1:i2, i1:i2] + + for i in range(count): + w 
= W1[:, i] + d = Hinv1[i, i] + + if groupsize != -1: + if (i1 + i) % groupsize == 0: + self.quantizer.find_params(W[:, (i1 + i):(i1 + i + groupsize)], weight=True) + + if ((i1 + i) // groupsize) - now_idx == -1: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + now_idx += 1 + + q = quantize( + w.unsqueeze(1), self.quantizer.scale, self.quantizer.zero, self.quantizer.maxq + ).flatten() + Q1[:, i] = q + Losses1[:, i] = (w - q) ** 2 / d ** 2 + + err1 = (w - q) / d + W1[:, i:] -= err1.unsqueeze(1).matmul(Hinv1[i, i:].unsqueeze(0)) + Err1[:, i] = err1 + + Q[:, i1:i2] = Q1 + Losses[:, i1:i2] = Losses1 / 2 + + W[:, i2:] -= Err1.matmul(Hinv[i1:i2, i2:]) + + if DEBUG: + self.layer.weight.data[:, :i2] = Q[:, :i2] + self.layer.weight.data[:, i2:] = W[:, i2:] + print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + print(torch.sum(Losses)) + + torch.cuda.synchronize() + print('time %.2f' % (time.time() - tick)) + print('error', torch.sum(Losses).item()) + + groupsize = groupsize if groupsize != -1 else self.columns + g_idx = [i // groupsize for i in range(self.columns)] + g_idx = torch.tensor(g_idx, dtype=torch.int32, device=Q.device) + if actorder: + invperm = torch.argsort(perm) + Q = Q[:, invperm] + g_idx = g_idx[invperm] + + if isinstance(self.layer, transformers.Conv1D): + Q = Q.t() + self.layer.weight.data = Q.reshape(self.layer.weight.shape).to(self.layer.weight.data.dtype) + if DEBUG: + print(torch.sum((self.layer(self.inp1) - self.out1) ** 2)) + + if scale == []: + scale.append(self.quantizer.scale) + zero.append(self.quantizer.zero) + scale = torch.cat(scale,dim=1) + zero = torch.cat(zero,dim=1) + return scale,zero,g_idx + + def free(self): + if DEBUG: + self.inp1 = None + self.out1 = None + self.H = None + self.Losses = None + self.Trace = None + torch.cuda.empty_cache() diff --git a/text-generation-webui/repositories/GPTQ-for-LLaMa/llama.py b/text-generation-webui/repositories/GPTQ-for-LLaMa/llama.py new file mode 100644 index 
0000000000000000000000000000000000000000..57fca88d1a4cd7181c2f0a72640791f95958e75a --- /dev/null +++ b/text-generation-webui/repositories/GPTQ-for-LLaMa/llama.py @@ -0,0 +1,485 @@ +import time + +import torch +import torch.nn as nn + +from gptq import * +from modelutils import * +from quant import * + + +def get_llama(model): + import torch + def skip(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import LlamaForCausalLM + model = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto') + model.seqlen = 2048 + return model + +@torch.no_grad() +def llama_sequential(model, dataloader, dev): + print('Starting ...') + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + model.model.norm = model.model.norm.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + model.model.norm = model.model.norm.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + layer = 
layers[i].to(dev) + full = find_layers(layer) + if args.true_sequential: + sequential = [ + ['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], + ['self_attn.o_proj'], + ['mlp.up_proj', 'mlp.gate_proj'], + ['mlp.down_proj'] + ] + else: + sequential = [list(full.keys())] + + for names in sequential: + subset = {n: full[n] for n in names} + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids = position_ids)[0] + for h in handles: + h.remove() + + for name in subset: + print(f'Quantizing {name} in layer {i+1}/{len(layers)}...') + scale,zero,g_idx = gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.layers.%d.%s' % (i, name)] = (gptq[name].quantizer.cpu(),scale.cpu(),zero.cpu(),g_idx.cpu()) + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids = position_ids)[0] + + layers[i] = layer.cpu() + del layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def llama_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.layers + + model.model.embed_tokens = model.model.embed_tokens.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = 
torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + cache['position_ids'] = kwargs['position_ids'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) + try: + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.embed_tokens = model.model.embed_tokens.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + position_ids = cache['position_ids'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids = position_ids)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + if model.model.norm is not None: + model.model.norm = model.model.norm.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + if model.model.norm is not None: + hidden_states = model.model.norm(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + 
shift_labels = testenc[ + :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) + + model.config.use_cache = use_cache + +# TODO: perform packing on GPU +def llama_pack(model, quantizers, wbits, groupsize): + layers = find_layers(model) + layers = {n: layers[n] for n in quantizers} + make_quant(model, quantizers, wbits, groupsize) + qlayers = find_layers(model, [QuantLinear]) + print('Packing ...') + for name in qlayers: + print(name) + quantizers[name],scale,zero,g_idx = quantizers[name] + qlayers[name].pack(layers[name], scale, zero, g_idx) + print('Done.') + return model + +def load_quant(model, checkpoint, wbits, groupsize = -1, warmup_autotune = True): + from transformers import LlamaConfig, LlamaForCausalLM + config = LlamaConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = LlamaForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + make_quant(model, layers, wbits, groupsize) + + del layers + + print('Loading model ...') + if checkpoint.endswith('.safetensors'): + from safetensors.torch import load_file as safe_load + model.load_state_dict(safe_load(checkpoint), strict = False) + else: + model.load_state_dict(torch.load(checkpoint), strict = False) + if warmup_autotune: + autotune_warmup(model) + model.seqlen = 2048 + print('Done.') + + return model + +def llama_multigpu(model, gpus): + 
model.model.embed_tokens = model.model.embed_tokens.to(gpus[0]) + if hasattr(model.model, 'norm') and model.model.norm: + model.model.norm = model.model.norm.to(gpus[-1]) + import copy + model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1]) + + cache = {'mask': None} + + class MoveModule(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + self.dev = next(iter(self.module.parameters())).device + def forward(self, *inp, **kwargs): + inp = list(inp) + if inp[0].device != self.dev: + inp[0] = inp[0].to(self.dev) + if cache['mask'] is None or cache['mask'].device != self.dev: + cache['mask'] = kwargs['attention_mask'].to(self.dev) + kwargs['attention_mask'] = cache['mask'] + tmp = self.module(*inp, **kwargs) + return tmp + + layers = model.model.layers + pergpu = math.ceil(len(layers) / len(gpus)) + for i in range(len(layers)): + layers[i] = MoveModule(layers[i].to(gpus[i // pergpu])) + + model.gpus = gpus + +def benchmark(model, input_ids, check=False): + input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV) + torch.cuda.synchronize() + + cache = {'past': None} + def clear_past(i): + def tmp(layer, inp, out): + if cache['past']: + cache['past'][i] = None + return tmp + for i, layer in enumerate(model.model.layers): + layer.register_forward_hook(clear_past(i)) + + print('Benchmarking ...') + + if check: + loss = nn.CrossEntropyLoss() + tot = 0. 
+ + def sync(): + if hasattr(model, 'gpus'): + for gpu in model.gpus: + torch.cuda.synchronize(gpu) + else: + torch.cuda.synchronize() + max_memory = 0 + with torch.no_grad(): + attention_mask = torch.ones((1, input_ids.numel()), device=DEV) + times = [] + for i in range(input_ids.numel()): + tick = time.time() + out = model( + input_ids[:, i:i+1], + past_key_values=cache['past'], + attention_mask=attention_mask[:, :(i + 1)].reshape((1, -1)) + ) + sync() + times.append(time.time() - tick) + print(i, times[-1]) + max_memory = max(max_memory,torch.cuda.memory_allocated() / 1024 /1024) + if check and i != input_ids.numel() - 1: + tot += loss(out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)).float() + cache['past'] = list(out.past_key_values) + del out + sync() + import numpy as np + print('Median:', np.median(times)) + if check: + print('PPL:', torch.exp(tot / (input_ids.numel() - 1)).item()) + print('max memory(MiB):',max_memory) + + +if __name__ == '__main__': + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + 'model', type=str, + help='llama model to load' + ) + parser.add_argument( + 'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], + help='Where to extract calibration data from.' + ) + parser.add_argument( + '--seed', + type=int, default=0, help='Seed for sampling the calibration data.' + ) + parser.add_argument( + '--nsamples', type=int, default=128, + help='Number of calibration data samples.' + ) + parser.add_argument( + '--percdamp', type=float, default=.01, + help='Percent of the average Hessian diagonal to use for dampening.' + ) + parser.add_argument( + '--nearest', action='store_true', + help='Whether to run the RTN baseline.' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' 
+ ) + parser.add_argument( + '--trits', action='store_true', + help='Whether to use trits for quantization.' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--eval', action='store_true', + help='evaluate quantized model.' + ) + parser.add_argument( + '--save', type=str, default='', + help='Save quantized checkpoint under this name.' + ) + parser.add_argument( + '--save_safetensors', type=str, default='', + help='Save quantized `.safetensors` checkpoint under this name.' + ) + parser.add_argument( + '--load', type=str, default='', + help='Load quantized model.' + ) + parser.add_argument( + '--benchmark', type=int, default=0, + help='Number of tokens to use for benchmarking.' + ) + parser.add_argument( + '--check', action='store_true', + help='Whether to compute perplexity during benchmarking for verification.' + ) + parser.add_argument( + '--sym', action='store_true', + help='Whether to perform symmetric quantization.' + ) + parser.add_argument( + '--act-order', action='store_true', + help='Whether to apply the activation order GPTQ heuristic' + ) + parser.add_argument( + '--true-sequential', action='store_true', + help='Whether to run in true sequential model.' 
+ ) + parser.add_argument( + '--new-eval', action='store_true', + help='Whether to use the new PTB and C4 eval' + ) + + args = parser.parse_args() + + if type(args.load) is not str: + args.load = args.load.as_posix() + + if args.load: + model = load_quant(args.model, args.load, args.wbits, args.groupsize) + else: + model = get_llama(args.model) + model.eval() + + dataloader, testloader = get_loaders( + args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + + if not args.load and args.wbits < 16 and not args.nearest: + tick = time.time() + quantizers = llama_sequential(model, dataloader, DEV) + print(time.time() - tick) + + if args.benchmark: + gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())] + if len(gpus) > 1: + llama_multigpu(model, gpus) + else: + model = model.to(DEV) + if args.benchmark: + input_ids = next(iter(dataloader))[0][:, :args.benchmark] + benchmark(model, input_ids, check=args.check) + + if args.eval: + datasets = ['wikitext2', 'ptb', 'c4'] + if args.new_eval: + datasets = ['wikitext2', 'ptb-new', 'c4-new'] + for dataset in datasets: + dataloader, testloader = get_loaders( + dataset, seed=args.seed, model=args.model, seqlen=model.seqlen + ) + print(dataset) + llama_eval(model, testloader, DEV) + + if args.save: + llama_pack(model, quantizers, args.wbits, args.groupsize) + torch.save(model.state_dict(), args.save) + + if args.save_safetensors: + llama_pack(model, quantizers, args.wbits, args.groupsize) + from safetensors.torch import save_file as safe_save + safe_save(model.state_dict(), args.save_safetensors) diff --git a/text-generation-webui/repositories/GPTQ-for-LLaMa/llama_inference.py b/text-generation-webui/repositories/GPTQ-for-LLaMa/llama_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9808a343774e28feb7de4bdc61897fcbde76967d --- /dev/null +++ b/text-generation-webui/repositories/GPTQ-for-LLaMa/llama_inference.py @@ -0,0 +1,137 @@ 
+import time + +import torch +import torch.nn as nn + +from gptq import * +from modelutils import * +from quant import * + +from transformers import AutoTokenizer + +DEV = torch.device('cuda:0') + +def get_llama(model): + import torch + def skip(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = skip + torch.nn.init.uniform_ = skip + torch.nn.init.normal_ = skip + from transformers import LlamaForCausalLM + model = LlamaForCausalLM.from_pretrained(model, torch_dtype='auto') + model.seqlen = 2048 + return model + +def load_quant(model, checkpoint, wbits, groupsize, device): + from transformers import LlamaConfig, LlamaForCausalLM + config = LlamaConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = LlamaForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + make_quant(model, layers, wbits, groupsize) + + print('Loading model ...') + if checkpoint.endswith('.safetensors'): + from safetensors.torch import load_file as safe_load + if device == -1: + device = "cpu" + model.load_state_dict(safe_load(checkpoint, device)) + else: + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + print('Done.') + + return model + +if __name__ == '__main__': + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + 'model', type=str, + help='llama model to load' + ) + parser.add_argument( + '--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16], + help='#bits to use for quantization; use 16 for evaluating base model.' 
+ ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--load', type=str, default='', + help='Load quantized model.' + ) + + parser.add_argument( + '--text', type=str, + help='input text' + ) + + parser.add_argument( + '--min_length', type=int, default=10, + help='The minimum length of the sequence to be generated.' + ) + + parser.add_argument( + '--max_length', type=int, default=50, + help='The maximum length of the sequence to be generated.' + ) + + parser.add_argument( + '--top_p', type=float , default=0.95, + help='If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.' + ) + + parser.add_argument( + '--temperature', type=float, default=0.8, + help='The value used to module the next token probabilities.' + ) + + parser.add_argument( + '--device', type=int, default=-1, + help='The device used to load the model when using safetensors. Default device is "cpu" or specify, 0,1,2,3,... for GPU device.' 
+ ) + + args = parser.parse_args() + + if type(args.load) is not str: + args.load = args.load.as_posix() + + if args.load: + model = load_quant(args.model, args.load, args.wbits, args.groupsize, args.device) + else: + model = get_llama(args.model) + model.eval() + + model.to(DEV) + tokenizer = AutoTokenizer.from_pretrained(args.model) + input_ids = tokenizer.encode(args.text, return_tensors="pt").to(DEV) + + with torch.no_grad(): + generated_ids = model.generate( + input_ids, + do_sample=True, + min_length=args.min_length, + max_length=args.max_length, + top_p=args.top_p, + temperature=args.temperature, + ) + print(tokenizer.decode([el.item() for el in generated_ids[0]])) diff --git a/text-generation-webui/repositories/GPTQ-for-LLaMa/llama_inference_offload.py b/text-generation-webui/repositories/GPTQ-for-LLaMa/llama_inference_offload.py new file mode 100644 index 0000000000000000000000000000000000000000..b0df9e128f6346c14fa50a2672ca4a7e5527a732 --- /dev/null +++ b/text-generation-webui/repositories/GPTQ-for-LLaMa/llama_inference_offload.py @@ -0,0 +1,311 @@ +import time + +import torch +import torch.nn as nn + +from gptq import * +from modelutils import * +from quant import * + +from transformers import AutoTokenizer + +DEV = torch.device('cuda:0') +import copy +from transformers.models.llama.modeling_llama import LlamaModel,LlamaConfig +from transformers.modeling_outputs import BaseModelOutputWithPast +from typing import List, Optional, Tuple, Union +import time + +class Offload_LlamaModel(LlamaModel): + def __init__(self, config: LlamaConfig): + super().__init__(config) + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = 
None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + `[0, config.n_positions - 1]`. + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
+ use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + seq_length_with_past = seq_length + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = 
past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx in range(len(self.layers)): + if idx <= (self.preload - 1): + decoder_layer = self.layers[idx] + else: + decoder_layer = self.layers[idx].to(DEV) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if idx > (self.preload - 1): + self.layers[idx] = decoder_layer.cpu() + del decoder_layer + torch.cuda.empty_cache() + + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + +def load_quant(model, checkpoint, wbits, 
groupsize, pre_layer): + transformers.models.llama.modeling_llama.LlamaModel = Offload_LlamaModel + from transformers import LlamaConfig, LlamaForCausalLM + config = LlamaConfig.from_pretrained(model) + def noop(*args, **kwargs): + pass + torch.nn.init.kaiming_uniform_ = noop + torch.nn.init.uniform_ = noop + torch.nn.init.normal_ = noop + + torch.set_default_dtype(torch.half) + transformers.modeling_utils._init_weights = False + torch.set_default_dtype(torch.half) + model = LlamaForCausalLM(config) + torch.set_default_dtype(torch.float) + model = model.eval() + layers = find_layers(model) + for name in ['lm_head']: + if name in layers: + del layers[name] + make_quant(model, layers, wbits, groupsize) + + print('Loading model ...') + if checkpoint.endswith('.safetensors'): + from safetensors.torch import load_file as safe_load + model.load_state_dict(safe_load(checkpoint)) + else: + model.load_state_dict(torch.load(checkpoint)) + model.seqlen = 2048 + + for i in range(pre_layer): + model.model.layers[i].to(DEV) + model.model.embed_tokens.to(DEV) + model.model.norm.to(DEV) + model.lm_head.to(DEV) + model.model.preload = pre_layer + print('Done.') + return model + +if __name__ == '__main__': + import argparse + from datautils import * + + parser = argparse.ArgumentParser() + + parser.add_argument( + 'model', type=str, + help='llama model to load' + ) + parser.add_argument( + '--wbits', type=int, default=4, choices=[2, 3, 4, 8], + help='#bits to use for quantization' + ) + parser.add_argument( + '--groupsize', type=int, default=-1, + help='Groupsize to use for quantization; default uses full row.' + ) + parser.add_argument( + '--load', type=str, default='', + help='Load quantized model.' + ) + parser.add_argument( + '--text', type=str, + help='input text' + ) + + parser.add_argument( + '--min_length', type=int, default=10, + help='The minimum length of the sequence to be generated.' 
def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
    """Collect sub-modules whose exact type is listed in ``layers``.

    The module tree is walked depth-first. A module whose ``type`` is in
    ``layers`` is reported under its dotted path and is not descended into.

    Args:
        module: Root ``nn.Module`` to search.
        layers: Module classes to look for (exact type match, subclasses are
            not matched).
        name: Dotted path of ``module`` itself; empty for the root.

    Returns:
        Dict mapping dotted module path to the matching module.
    """
    if type(module) in layers:
        return {name: module}

    found = {}
    for child_name, child in module.named_children():
        if name != '':
            child_path = '.'.join((name, child_name))
        else:
            child_path = child_name
        found.update(find_layers(child, layers=layers, name=child_path))
    return found
kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for batch in dataloader: + try: + model(batch[0].to(dev)) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() + model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() + if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.cpu() + if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + print('Ready.') + + quantizers = {} + for i in range(len(layers)): + layer = layers[i].to(dev) + + subset = find_layers(layer) + gptq = {} + for name in subset: + gptq[name] = GPTQ(subset[name]) + gptq[name].quantizer = Quantizer() + gptq[name].quantizer.configure( args.wbits, perchannel=True, sym=args.sym, mse=False, trits=args.trits ) + + def add_batch(name): + def tmp(_, inp, out): + gptq[name].add_batch(inp[0].data, out.data) + return tmp + + handles = [] + for name in subset: + handles.append(subset[name].register_forward_hook(add_batch(name))) + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + for h in handles: + h.remove() + + for name in subset: + print(f'Quantizing {name} in layer {i+1}/{len(layers)}...') + scale,zero,g_idx = gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order) + quantizers['model.decoder.layers.%d.%s' % (i, name)] = (gptq[name].quantizer.cpu(),scale.cpu(),zero.cpu(),g_idx.cpu()) + gptq[name].free() + + for j in range(args.nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + + layers[i] = layer.cpu() + del 
layer + del gptq + torch.cuda.empty_cache() + + inps, outs = outs, inps + + model.config.use_cache = use_cache + + return quantizers + +@torch.no_grad() +def opt_eval(model, testenc, dev): + print('Evaluating ...') + + testenc = testenc.input_ids + nsamples = testenc.numel() // model.seqlen + + use_cache = model.config.use_cache + model.config.use_cache = False + layers = model.model.decoder.layers + + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(dev) + model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(dev) + if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.to(dev) + if hasattr(model.model.decoder, 'project_in') and model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.to(dev) + layers[0] = layers[0].to(dev) + + dtype = next(iter(model.parameters())).dtype + inps = torch.zeros( + (nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev + ) + cache = {'i': 0, 'attention_mask': None} + + class Catcher(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + def forward(self, inp, **kwargs): + inps[cache['i']] = inp + cache['i'] += 1 + cache['attention_mask'] = kwargs['attention_mask'] + raise ValueError + layers[0] = Catcher(layers[0]) + for i in range(nsamples): + batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev) + try: + model(batch) + except ValueError: + pass + layers[0] = layers[0].module + + layers[0] = layers[0].cpu() + model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.cpu() + model.model.decoder.embed_positions = model.model.decoder.embed_positions.cpu() + if hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.cpu() + if hasattr(model.model.decoder, 'project_in') and 
model.model.decoder.project_in: + model.model.decoder.project_in = model.model.decoder.project_in.cpu() + torch.cuda.empty_cache() + + outs = torch.zeros_like(inps) + attention_mask = cache['attention_mask'] + + for i in range(len(layers)): + print(i) + layer = layers[i].to(dev) + + if args.nearest: + subset = find_layers(layer) + for name in subset: + quantizer = Quantizer() + quantizer.configure( + args.wbits, perchannel=True, sym=args.sym, mse=False + ) + W = subset[name].weight.data + quantizer.find_params(W, weight=True) + subset[name].weight.data = quantize( + W, quantizer.scale, quantizer.zero, quantizer.maxq + ).to(next(iter(layer.parameters())).dtype) + + for j in range(nsamples): + outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0] + layers[i] = layer.cpu() + del layer + torch.cuda.empty_cache() + inps, outs = outs, inps + + if model.model.decoder.final_layer_norm is not None: + model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(dev) + if model.model.decoder.project_out is not None: + model.model.decoder.project_out = model.model.decoder.project_out.to(dev) + model.lm_head = model.lm_head.to(dev) + + testenc = testenc.to(dev) + nlls = [] + for i in range(nsamples): + hidden_states = inps[i].unsqueeze(0) + if model.model.decoder.final_layer_norm is not None: + hidden_states = model.model.decoder.final_layer_norm(hidden_states) + if model.model.decoder.project_out is not None: + hidden_states = model.model.decoder.project_out(hidden_states) + lm_logits = model.lm_head(hidden_states) + shift_logits = lm_logits[:, :-1, :].contiguous() + shift_labels = testenc[ + :, (i * model.seqlen):((i + 1) * model.seqlen) + ][:, 1:] + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + neg_log_likelihood = loss.float() * model.seqlen + nlls.append(neg_log_likelihood) + ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen)) + print(ppl.item()) 
def load_quant(model, checkpoint, wbits, groupsize = -1, warmup_autotune = True):
    """Build a quantized OPT model and load a quantized checkpoint into it.

    Args:
        model: Name or path handed to ``OPTConfig.from_pretrained``.
        checkpoint: Path of the quantized state dict. Paths ending in
            ``.safetensors`` are read with safetensors, everything else with
            ``torch.load``.
        wbits: Bit width forwarded to ``make_quant``.
        groupsize: Group size forwarded to ``make_quant`` (default -1).
        warmup_autotune: When true, ``autotune_warmup`` is run on the loaded
            model before returning.

    Returns:
        The ``OPTForCausalLM`` in eval mode with ``seqlen`` set to the
        config's ``max_position_embeddings``.
    """
    # Imported here because this function touches transformers.modeling_utils
    # and must not depend on a star import leaking the name into the module.
    import transformers
    from transformers import OPTConfig, OPTForCausalLM
    config = OPTConfig.from_pretrained(model)

    def noop(*args, **kwargs):
        pass

    # Random initialisation is skipped while the skeleton is built (the
    # weights are loaded from the checkpoint right after). The patches and the
    # half default dtype are global state, so they are always undone, even
    # when construction fails.
    patched_inits = ('kaiming_uniform_', 'uniform_', 'normal_')
    saved_inits = {attr: getattr(torch.nn.init, attr) for attr in patched_inits}
    saved_init_flag = getattr(transformers.modeling_utils, '_init_weights', True)
    try:
        for attr in patched_inits:
            setattr(torch.nn.init, attr, noop)
        transformers.modeling_utils._init_weights = False
        torch.set_default_dtype(torch.half)
        model = OPTForCausalLM(config)
    finally:
        torch.set_default_dtype(torch.float)
        transformers.modeling_utils._init_weights = saved_init_flag
        for attr, original_init in saved_inits.items():
            setattr(torch.nn.init, attr, original_init)

    model = model.eval()
    layers = find_layers(model)
    # These projections and the output head are excluded from the set of
    # layers handed to make_quant.
    for name in ['model.decoder.project_out', 'model.decoder.project_in', 'lm_head']:
        layers.pop(name, None)
    make_quant(model, layers, wbits, groupsize)

    del layers

    print('Loading model ...')
    if checkpoint.endswith('.safetensors'):
        from safetensors.torch import load_file as safe_load
        model.load_state_dict(safe_load(checkpoint))
    else:
        model.load_state_dict(torch.load(checkpoint))
    if warmup_autotune:
        autotune_warmup(model)
    model.seqlen = model.config.max_position_embeddings
    print('Done.')
    return model
hasattr(model.model.decoder, 'project_out') and model.model.decoder.project_out: + model.model.decoder.project_out = model.model.decoder.project_out.to(gpus[-1]) + if hasattr(model.model.decoder, 'final_layer_norm') and model.model.decoder.final_layer_norm: + model.model.decoder.final_layer_norm = model.model.decoder.final_layer_norm.to(gpus[-1]) + import copy + model.lm_head = copy.deepcopy(model.lm_head).to(gpus[-1]) + + cache = {'mask': None} + + class MoveModule(nn.Module): + def __init__(self, module): + super().__init__() + self.module = module + self.dev = next(iter(self.module.parameters())).device + def forward(self, *inp, **kwargs): + inp = list(inp) + if inp[0].device != self.dev: + inp[0] = inp[0].to(self.dev) + if cache['mask'] is None or cache['mask'].device != self.dev: + cache['mask'] = kwargs['attention_mask'].to(self.dev) + kwargs['attention_mask'] = cache['mask'] + tmp = self.module(*inp, **kwargs) + return tmp + + layers = model.model.decoder.layers + pergpu = math.ceil(len(layers) / len(gpus)) + for i in range(len(layers)): + layers[i] = MoveModule(layers[i].to(gpus[i // pergpu])) + + model.gpus = gpus + +def benchmark(model, input_ids, check=False): + input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV) + torch.cuda.synchronize() + + cache = {'past': None} + def clear_past(i): + def tmp(layer, inp, out): + if cache['past']: + cache['past'][i] = None + return tmp + for i, layer in enumerate(model.model.decoder.layers): + layer.register_forward_hook(clear_past(i)) + + print('Benchmarking ...') + + if check: + loss = nn.CrossEntropyLoss() + tot = 0. 
if __name__ == '__main__':
    # Command-line driver: optionally quantize an OPT model with GPTQ, then
    # benchmark, evaluate and/or save it depending on the flags given.
    import argparse
    from datautils import *

    parser = argparse.ArgumentParser()

    parser.add_argument(
        'model', type=str,
        help='OPT model to load; pass `facebook/opt-X`.'
    )
    parser.add_argument(
        'dataset', type=str, choices=['wikitext2', 'ptb', 'c4'],
        help='Where to extract calibration data from.'
    )
    parser.add_argument(
        '--seed',
        type=int, default=0, help='Seed for sampling the calibration data.'
    )
    parser.add_argument(
        '--nsamples', type=int, default=128,
        help='Number of calibration data samples.'
    )
    parser.add_argument(
        '--percdamp', type=float, default=.01,
        help='Percent of the average Hessian diagonal to use for dampening.'
    )
    parser.add_argument(
        '--nearest', action='store_true',
        help='Whether to run the RTN baseline.'
    )
    parser.add_argument(
        '--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16],
        help='#bits to use for quantization; use 16 for evaluating base model.'
    )
    parser.add_argument(
        '--trits', action='store_true',
        help='Whether to use trits for quantization.'
    )
    parser.add_argument(
        '--groupsize', type=int, default=-1,
        help='Groupsize to use for quantization; default uses full row.'
    )
    parser.add_argument(
        '--eval', action='store_true',
        help='evaluate quantized model.'
    )
    parser.add_argument(
        '--save', type=str, default='',
        help='Save quantized checkpoint under this name.'
    )
    parser.add_argument(
        '--save_safetensors', type=str, default='',
        help='Save quantized `.safetensors` checkpoint under this name.'
    )
    parser.add_argument(
        '--load', type=str, default='',
        help='Load quantized model.'
    )
    parser.add_argument(
        '--benchmark', type=int, default=0,
        help='Number of tokens to use for benchmarking.'
    )
    parser.add_argument(
        '--check', action='store_true',
        help='Whether to compute perplexity during benchmarking for verification.'
    )
    parser.add_argument(
        '--sym', action='store_true',
        help='Whether to perform symmetric quantization.'
    )
    parser.add_argument(
        '--act-order', action='store_true',
        help='Whether to apply the activation order GPTQ heuristic'
    )
    parser.add_argument(
        '--new-eval', action='store_true',
        help='Whether to use the new PTB and C4 eval'
    )

    # `args` must stay a module-level name: opt_sequential and opt_eval read
    # it as a global.
    args = parser.parse_args()

    if type(args.load) is not str:
        args.load = args.load.as_posix()

    # Quantizers are only produced when GPTQ actually runs in this process.
    # Saving needs them, so reject impossible flag combinations up front
    # instead of failing with a NameError after all the expensive work.
    will_quantize = not args.load and args.wbits < 16 and not args.nearest
    wants_save = bool(args.save or args.save_safetensors)
    if wants_save and not will_quantize:
        parser.error(
            '--save/--save_safetensors require quantizing in this run '
            '(no --load, no --nearest, and --wbits below 16).'
        )

    if args.load:
        model = load_quant(args.model, args.load, args.wbits, args.groupsize)
    else:
        model = get_opt(args.model)
        model.eval()

    dataloader, testloader = get_loaders(
        args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen
    )

    quantizers = None
    if will_quantize:
        tick = time.time()
        quantizers = opt_sequential(model, dataloader, DEV)
        print(time.time() - tick)

    if args.benchmark:
        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
        if len(gpus) > 1:
            opt_multigpu(model, gpus)
        else:
            model = model.to(DEV)
        input_ids = next(iter(dataloader))[0][:, :args.benchmark]
        benchmark(model, input_ids, check=args.check)

    if args.eval:
        datasets = ['wikitext2', 'ptb', 'c4']
        if args.new_eval:
            datasets = ['wikitext2', 'ptb-new', 'c4-new']
        for dataset in datasets:
            dataloader, testloader = get_loaders(
                dataset, seed=args.seed, model=args.model, seqlen=model.seqlen
            )
            print(dataset)
            opt_eval(model, testloader, DEV)

    if wants_save:
        # Pack exactly once. opt_pack overwrites every quantizers[name]
        # 4-tuple with just the quantizer, so a second call would no longer
        # find the tuples it unpacks.
        opt_pack(model, quantizers, args.wbits, args.groupsize)
        state_dict = model.state_dict()
        if args.save:
            torch.save(state_dict, args.save)
        if args.save_safetensors:
            from safetensors.torch import save_file as safe_save
            safe_save(state_dict, args.save_safetensors)
dev = x.device + self.maxq = self.maxq.to(dev) + + shape = x.shape + if self.perchannel: + if weight: + x = x.flatten(1) + else: + if len(shape) == 4: + x = x.permute([1, 0, 2, 3]) + x = x.flatten(1) + if len(shape) == 3: + x = x.reshape((-1, shape[-1])).t() + if len(shape) == 2: + x = x.t() + else: + x = x.flatten().unsqueeze(0) + + tmp = torch.zeros(x.shape[0], device=dev) + xmin = torch.minimum(x.min(1)[0], tmp) + xmax = torch.maximum(x.max(1)[0], tmp) + + if self.sym: + xmax = torch.maximum(torch.abs(xmin), xmax) + tmp = xmin < 0 + if torch.any(tmp): + xmin[tmp] = -xmax[tmp] + tmp = (xmin == 0) & (xmax == 0) + xmin[tmp] = -1 + xmax[tmp] = +1 + + if self.maxq < 0: + self.scale = xmax + self.zero = xmin + else: + self.scale = (xmax - xmin) / self.maxq + if self.sym: + self.zero = torch.full_like(self.scale, (self.maxq + 1) / 2) + else: + self.zero = torch.round(-xmin / self.scale) + + if self.mse: + best = torch.full([x.shape[0]], float('inf'), device=dev) + for i in range(int(self.maxshrink * self.grid)): + p = 1 - i / self.grid + xmin1 = p * xmin + xmax1 = p * xmax + scale1 = (xmax1 - xmin1) / self.maxq + zero1 = torch.round(-xmin1 / scale1) if not self.sym else self.zero + q = quantize(x, scale1.unsqueeze(1), zero1.unsqueeze(1), self.maxq) + q -= x + q.abs_() + q.pow_(self.norm) + err = torch.sum(q, 1) + tmp = err < best + if torch.any(tmp): + best[tmp] = err[tmp] + self.scale[tmp] = scale1[tmp] + self.zero[tmp] = zero1[tmp] + if not self.perchannel: + if weight: + tmp = shape[0] + else: + tmp = shape[1] if len(shape) != 3 else shape[2] + self.scale = self.scale.repeat(tmp) + self.zero = self.zero.repeat(tmp) + + if weight: + shape = [-1] + [1] * (len(shape) - 1) + self.scale = self.scale.reshape(shape) + self.zero = self.zero.reshape(shape) + return + if len(shape) == 4: + self.scale = self.scale.reshape((1, -1, 1, 1)) + self.zero = self.zero.reshape((1, -1, 1, 1)) + if len(shape) == 3: + self.scale = self.scale.reshape((1, 1, -1)) + self.zero = 
self.zero.reshape((1, 1, -1)) + if len(shape) == 2: + self.scale = self.scale.unsqueeze(0) + self.zero = self.zero.unsqueeze(0) + + def quantize(self, x): + if self.ready(): + return quantize(x, self.scale, self.zero, self.maxq) + return x + + def enabled(self): + return self.maxq > 0 + + def ready(self): + return torch.all(self.scale != 0) + +try: + import triton + import triton.language as tl + import custom_autotune + + # code based https://github.com/fpgaminer/GPTQ-triton + @custom_autotune.autotune( + configs=[ + triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + # These provided a benefit on a 3090 + triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 
'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + ], + key=['M', 'N'], + nearest_power_of_two=True, + ) + + @triton.jit + def matmul_248_kernel(a_ptr, b_ptr, c_ptr, + scales_ptr, zeros_ptr, g_ptr, + M, N, K, bits, maxq, + stride_am, stride_ak, + stride_bk, stride_bn, + stride_cm, stride_cn, + stride_scales, stride_zeros, + BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr): + """ + Compute the matrix multiplication C = A x B. + A is of shape (M, K) float16 + B is of shape (K//8, N) int32 + C is of shape (M, N) float16 + scales is of shape (G, N) float16 + zeros is of shape (G, N) float16 + g_ptr is of shape (K) int32 + """ + infearure_per_bits = 32 // bits + + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_k = tl.cdiv(K, BLOCK_SIZE_K) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + a_mask = (offs_am[:, None] < M) + # b_ptrs is set up such that it repeats elements along the K axis 8 times + b_ptrs = b_ptr + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn) # (BLOCK_SIZE_K, BLOCK_SIZE_N) + g_ptrs = g_ptr + offs_k + # shifter is used to extract the N bits of each element in the 32-bit word from B + scales_ptrs = scales_ptr + offs_bn[None, :] + zeros_ptrs = 
zeros_ptr + (offs_bn[None, :]// infearure_per_bits) + + shifter = (offs_k % infearure_per_bits) * bits + zeros_shifter = (offs_bn % infearure_per_bits) * bits + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + + for k in range(0, num_pid_k): + g_idx = tl.load(g_ptrs) + + # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop + scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + + zeros = (zeros >> zeros_shifter[None, :]) & maxq + zeros = (zeros + 1) + + a = tl.load(a_ptrs, mask=a_mask, other=0.) # (BLOCK_SIZE_M, BLOCK_SIZE_K) + b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated + + # Now we need to unpack b (which is N-bit values) into 32-bit values + b = (b >> shifter[:, None]) & maxq # Extract the N-bit values + b = (b - zeros) * scales # Scale and shift + + accumulator += tl.dot(a, b) + a_ptrs += BLOCK_SIZE_K + b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk + g_ptrs += BLOCK_SIZE_K + + c = accumulator.to(tl.float16) + c_ptrs = c_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :] + c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + # code based https://github.com/fpgaminer/GPTQ-triton + @custom_autotune.autotune( + configs=[ + triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 256, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_K': 128, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 128, 
'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_K': 32, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + # These provided a benefit on a 3090 + triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 32, 'BLOCK_SIZE_N': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_K': 32, 'BLOCK_SIZE_N': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_K': 64, 'BLOCK_SIZE_N': 128, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4), + ], + key=['M', 'K'], + nearest_power_of_two=True, + ) + + @triton.jit + def trans_matmul_248_kernel(a_ptr, b_ptr, c_ptr, + scales_ptr, zeros_ptr, g_ptr, + M, N, K, bits, maxq, + stride_am, stride_ak, + stride_bk, stride_bn, + stride_cm, stride_cn, + stride_scales, stride_zeros, + BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr): + """ + Compute the matrix multiplication C = A x B. 
+ A is of shape (M, N) float16 + B is of shape (K//8, N) int32 + C is of shape (M, K) float16 + scales is of shape (G, N) float16 + zeros is of shape (G, N) float16 + g_ptr is of shape (K) int32 + """ + infearure_per_bits = 32 // bits + + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_k = tl.cdiv(K, BLOCK_SIZE_K) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_k + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_k = (pid % num_pid_in_group) // group_size_m + + offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_bk = pid_k * BLOCK_SIZE_K + tl.arange(0, BLOCK_SIZE_K) + offs_n = tl.arange(0, BLOCK_SIZE_N) + a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_n[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_N) + a_mask = (offs_am[:, None] < M) + # b_ptrs is set up such that it repeats elements along the K axis 8 times + b_ptrs = b_ptr + ((offs_bk[:, None] // infearure_per_bits) * stride_bk + offs_n[None, :] * stride_bn) # (BLOCK_SIZE_K, BLOCK_SIZE_N) + g_ptrs = g_ptr + offs_bk + g_idx = tl.load(g_ptrs) + + # shifter is used to extract the N bits of each element in the 32-bit word from B + scales_ptrs = scales_ptr + offs_n[None, :] + g_idx[:, None] * stride_scales + zeros_ptrs = zeros_ptr + (offs_n[None, :]// infearure_per_bits) + g_idx[:, None] * stride_zeros + + shifter = (offs_bk % infearure_per_bits) * bits + zeros_shifter = (offs_n % infearure_per_bits) * bits + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_K), dtype=tl.float32) + + for k in range(0, num_pid_n): + # Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop + scales = tl.load(scales_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + zeros = tl.load(zeros_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N,) + + zeros = (zeros >> zeros_shifter[None, :]) & maxq + zeros = (zeros + 1) 
def transpose_matmul248(input, qweight, scales, qzeros, g_idx, bits, maxq):
    """Multiply ``input`` by the transpose of the packed quantized weight.

    Used by ``QuantLinearFunction.backward`` to compute the gradient with
    respect to the layer input.

    Args:
        input: 2-D tensor; its rows become the rows of the result.
        qweight: Packed int32 weight; ``32 // bits`` values are stored per
            32-bit word along dimension 0.
        scales: Scales tensor passed through to the kernel.
        qzeros: Packed zero points passed through to the kernel.
        g_idx: Group index tensor passed through to the kernel.
        bits: Bit width of the packed values.
        maxq: Mask applied by the kernel when extracting packed values.

    Returns:
        A float16 CUDA tensor of shape
        ``(input.shape[0], qweight.shape[0] * 32 // bits)``.
    """
    # Unpacked size of dimension 0 of qweight.
    output_dim = (qweight.shape[0] * 32) // bits
    output = torch.empty((input.shape[0], output_dim), device='cuda', dtype=torch.float16)
    # One program per (BLOCK_SIZE_M x BLOCK_SIZE_K) tile of the output.
    grid = lambda META: (triton.cdiv(input.shape[0], META['BLOCK_SIZE_M']) * triton.cdiv(output_dim, META['BLOCK_SIZE_K']),)
    # The kernel is defined as `trans_matmul_248_kernel`; the previous call
    # used a name that does not exist and raised NameError.
    trans_matmul_248_kernel[grid](input, qweight, output,
                                  scales, qzeros, g_idx,
                                  input.shape[0], qweight.shape[1], output_dim, bits, maxq,
                                  input.stride(0), input.stride(1),
                                  qweight.stride(0), qweight.stride(1),
                                  output.stride(0), output.stride(1),
                                  scales.stride(0), qzeros.stride(0))
    return output
scales.clone().half() + if linear.bias is not None: + self.bias = linear.bias.clone().half() + + intweight = [] + for idx in range(self.infeatures): + intweight.append(torch.round((linear.weight.data[:,idx] + scale_zeros[self.g_idx[idx]]) / self.scales[self.g_idx[idx]]).to(torch.int)[:,None]) + intweight = torch.cat(intweight,dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + qweight = np.zeros((intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32) + i = 0 + row = 0 + while row < qweight.shape[0]: + if self.bits in [2,4,8]: + for j in range(i, i + (32//self.bits)): + qweight[row] |= intweight[j] << (self.bits * (j - i)) + i += 32//self.bits + row += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qweight = qweight.astype(np.int32) + self.qweight = torch.from_numpy(qweight) + + zeros -= 1; + zeros = zeros.numpy().astype(np.uint32) + qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32) + i = 0 + col = 0 + while col < qzeros.shape[1]: + if self.bits in [2,4,8]: + for j in range(i, i + (32//self.bits)): + qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) + i += 32//self.bits + col += 1 + else: + raise NotImplementedError("Only 2,4,8 bits are supported.") + + qzeros = qzeros.astype(np.int32) + self.qzeros = torch.from_numpy(qzeros) + + def forward(self, x): + out_shape = x.shape[:-1] + (self.outfeatures, ) + out = QuantLinearFunction.apply(x.reshape(-1,x.shape[-1]), self.qweight, self.scales, + self.qzeros, self.g_idx, self.bits, self.maxq) + out = out + self.bias if self.bias is not None else out + return out.reshape(out_shape) + +def autotune_warmup(model, transpose = False): + """ + Pre-tunes the quantized kernel + """ + from tqdm import tqdm + + n_values = {} + + for _, m in model.named_modules(): + if not isinstance(m, QuantLinear): + continue + + k = m.infeatures + n = m.outfeatures + + if n not in n_values: + n_values[n] = (k, 
m.qweight.cuda(), m.scales.cuda(), m.qzeros.cuda(), m.g_idx.cuda(), m.bits, m.maxq) + + print(f'Found {len(n_values)} unique N values.') + + print('Warming up autotune cache ...') + for m in tqdm(range(0, 12)): + m = 2 ** m # [1, 2048] + for n, (k, qweight, scales, qzeros, g_idx, bits, maxq) in n_values.items(): + a = torch.randn(m, k, dtype=torch.float16, device='cuda') + matmul248(a, qweight, scales, qzeros, g_idx, bits, maxq) + if transpose: + a = torch.randn(m, n, dtype=torch.float16, device='cuda') + transpose_matmul248(a, qweight, scales, qzeros, g_idx, bits, maxq) + del n_values + +def make_quant(module, names, bits, groupsize, name=''): + if isinstance(module, QuantLinear): + return + for attr in dir(module): + tmp = getattr(module, attr) + name1 = name + '.' + attr if name != '' else attr + if name1 in names: + delattr(module, attr) + setattr(module, attr, QuantLinear(bits, groupsize, tmp.in_features, tmp.out_features, tmp.bias is not None)) + for name1, child in module.named_children(): + make_quant(child, names, bits, groupsize, name + '.' 
+ name1 if name != '' else name1) diff --git a/text-generation-webui/repositories/GPTQ-for-LLaMa/quant_cuda-0.0.0-cp310-cp310-win_amd64.whl b/text-generation-webui/repositories/GPTQ-for-LLaMa/quant_cuda-0.0.0-cp310-cp310-win_amd64.whl new file mode 100644 index 0000000000000000000000000000000000000000..a414ae191d8d71b5e488419530ad20a6dc50ff8c Binary files /dev/null and b/text-generation-webui/repositories/GPTQ-for-LLaMa/quant_cuda-0.0.0-cp310-cp310-win_amd64.whl differ diff --git a/text-generation-webui/repositories/GPTQ-for-LLaMa/requirements.txt b/text-generation-webui/repositories/GPTQ-for-LLaMa/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e6005fd10fed87d3182cee07056dc0bda7ef538 --- /dev/null +++ b/text-generation-webui/repositories/GPTQ-for-LLaMa/requirements.txt @@ -0,0 +1,6 @@ +safetensors==0.3.0 +datasets==2.10.1 +sentencepiece +git+https://github.com/huggingface/transformers +accelerate==0.17.1 +triton==2.0.0 diff --git a/text-generation-webui/requirements.txt b/text-generation-webui/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa1a38d65d788367de855a30c5e76b93468b86c4 --- /dev/null +++ b/text-generation-webui/requirements.txt @@ -0,0 +1,15 @@ +accelerate==0.18.0 +bitsandbytes==0.37.2 +datasets +flexgen==0.1.7 +gradio==3.24.1 +markdown +numpy +peft==0.2.0 +requests +rwkv==0.7.3 +safetensors==0.3.0 +sentencepiece +pyyaml +tqdm +git+https://github.com/huggingface/transformers diff --git a/text-generation-webui/server.py b/text-generation-webui/server.py new file mode 100644 index 0000000000000000000000000000000000000000..4f2324f50ddd516ed1743bbfb16c06158f661f5c --- /dev/null +++ b/text-generation-webui/server.py @@ -0,0 +1,591 @@ +import os + +os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False' + +import io +import json +import re +import sys +import time +import zipfile +from datetime import datetime +from pathlib import Path + +import gradio as gr +from PIL import Image + +import 
modules.extensions as extensions_module +from modules import api, chat, shared, training, ui +from modules.html_generator import chat_html_wrapper +from modules.LoRA import add_lora_to_model +from modules.models import load_model, load_soft_prompt +from modules.text_generation import (clear_torch_cache, generate_reply, + stop_everything_event) + +# Loading custom settings +settings_file = None +if shared.args.settings is not None and Path(shared.args.settings).exists(): + settings_file = Path(shared.args.settings) +elif Path('settings.json').exists(): + settings_file = Path('settings.json') +if settings_file is not None: + print(f"Loading settings from {settings_file}...") + new_settings = json.loads(open(settings_file, 'r').read()) + for item in new_settings: + shared.settings[item] = new_settings[item] + + +def get_available_models(): + if shared.args.flexgen: + return sorted([re.sub('-np$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if item.name.endswith('-np')], key=str.lower) + else: + return sorted([re.sub('.pth$', '', item.name) for item in list(Path(f'{shared.args.model_dir}/').glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json'))], key=str.lower) + + +def get_available_presets(): + return sorted(set((k.stem for k in Path('presets').glob('*.txt'))), key=str.lower) + + +def get_available_prompts(): + prompts = [] + prompts += sorted(set((k.stem for k in Path('prompts').glob('[0-9]*.txt'))), key=str.lower, reverse=True) + prompts += sorted(set((k.stem for k in Path('prompts').glob('*.txt'))), key=str.lower) + prompts += ['None'] + return prompts + + +def get_available_characters(): + paths = (x for x in Path('characters').iterdir() if x.suffix in ('.json', '.yaml', '.yml')) + return ['None'] + sorted(set((k.stem for k in paths if k.stem != "instruction-following")), key=str.lower) + + +def get_available_instruction_templates(): + path = "characters/instruction-following" + paths = [] + if os.path.exists(path): 
+ paths = (x for x in Path(path).iterdir() if x.suffix in ('.json', '.yaml', '.yml')) + return ['None'] + sorted(set((k.stem for k in paths)), key=str.lower) + + +def get_available_extensions(): + return sorted(set(map(lambda x: x.parts[1], Path('extensions').glob('*/script.py'))), key=str.lower) + + +def get_available_softprompts(): + return ['None'] + sorted(set((k.stem for k in Path('softprompts').glob('*.zip'))), key=str.lower) + + +def get_available_loras(): + return ['None'] + sorted([item.name for item in list(Path(shared.args.lora_dir).glob('*')) if not item.name.endswith(('.txt', '-np', '.pt', '.json'))], key=str.lower) + + +def unload_model(): + shared.model = shared.tokenizer = None + clear_torch_cache() + + +def load_model_wrapper(selected_model): + if selected_model != shared.model_name: + shared.model_name = selected_model + + unload_model() + if selected_model != '': + shared.model, shared.tokenizer = load_model(shared.model_name) + + return selected_model + + +def load_lora_wrapper(selected_lora): + add_lora_to_model(selected_lora) + return selected_lora + + +def load_preset_values(preset_menu, state, return_dict=False): + generate_params = { + 'do_sample': True, + 'temperature': 1, + 'top_p': 1, + 'typical_p': 1, + 'repetition_penalty': 1, + 'encoder_repetition_penalty': 1, + 'top_k': 50, + 'num_beams': 1, + 'penalty_alpha': 0, + 'min_length': 0, + 'length_penalty': 1, + 'no_repeat_ngram_size': 0, + 'early_stopping': False, + } + with open(Path(f'presets/{preset_menu}.txt'), 'r') as infile: + preset = infile.read() + for i in preset.splitlines(): + i = i.rstrip(',').strip().split('=') + if len(i) == 2 and i[0].strip() != 'tokens': + generate_params[i[0].strip()] = eval(i[1].strip()) + generate_params['temperature'] = min(1.99, generate_params['temperature']) + + if return_dict: + return generate_params + else: + state.update(generate_params) + return state, *[generate_params[k] for k in ['do_sample', 'temperature', 'top_p', 'typical_p', 
'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']] + + +def upload_soft_prompt(file): + with zipfile.ZipFile(io.BytesIO(file)) as zf: + zf.extract('meta.json') + j = json.loads(open('meta.json', 'r').read()) + name = j['name'] + Path('meta.json').unlink() + + with open(Path(f'softprompts/{name}.zip'), 'wb') as f: + f.write(file) + + return name + + +def save_prompt(text): + fname = f"{datetime.now().strftime('%Y-%m-%d-%H%M%S')}.txt" + with open(Path(f'prompts/{fname}'), 'w', encoding='utf-8') as f: + f.write(text) + return f"Saved to prompts/{fname}" + + +def load_prompt(fname): + if fname in ['None', '']: + return '' + else: + with open(Path(f'prompts/{fname}.txt'), 'r', encoding='utf-8') as f: + text = f.read() + if text[-1] == '\n': + text = text[:-1] + return text + + +def create_prompt_menus(): + with gr.Row(): + with gr.Column(): + with gr.Row(): + shared.gradio['prompt_menu'] = gr.Dropdown(choices=get_available_prompts(), value='None', label='Prompt') + ui.create_refresh_button(shared.gradio['prompt_menu'], lambda: None, lambda: {'choices': get_available_prompts()}, 'refresh-button') + + with gr.Column(): + with gr.Column(): + shared.gradio['save_prompt'] = gr.Button('Save prompt') + shared.gradio['status'] = gr.Markdown('Ready') + + shared.gradio['prompt_menu'].change(load_prompt, [shared.gradio['prompt_menu']], [shared.gradio['textbox']], show_progress=False) + shared.gradio['save_prompt'].click(save_prompt, [shared.gradio['textbox']], [shared.gradio['status']], show_progress=False) + + +def create_model_menus(): + with gr.Row(): + with gr.Column(): + with gr.Row(): + shared.gradio['model_menu'] = gr.Dropdown(choices=available_models, value=shared.model_name, label='Model') + ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': get_available_models()}, 'refresh-button') + with gr.Column(): + with gr.Row(): + 
shared.gradio['lora_menu'] = gr.Dropdown(choices=available_loras, value=shared.lora_name, label='LoRA') + ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': get_available_loras()}, 'refresh-button') + + shared.gradio['model_menu'].change(load_model_wrapper, shared.gradio['model_menu'], shared.gradio['model_menu'], show_progress=True) + shared.gradio['lora_menu'].change(load_lora_wrapper, shared.gradio['lora_menu'], shared.gradio['lora_menu'], show_progress=True) + + +def create_settings_menus(default_preset): + generate_params = load_preset_values(default_preset if not shared.args.flexgen else 'Naive', {}, return_dict=True) + for k in ['max_new_tokens', 'seed', 'stop_at_newline', 'chat_prompt_size', 'chat_generation_attempts']: + generate_params[k] = shared.settings[k] + shared.gradio['generate_state'] = gr.State(generate_params) + + with gr.Row(): + with gr.Column(): + with gr.Row(): + shared.gradio['preset_menu'] = gr.Dropdown(choices=available_presets, value=default_preset if not shared.args.flexgen else 'Naive', label='Generation parameters preset') + ui.create_refresh_button(shared.gradio['preset_menu'], lambda: None, lambda: {'choices': get_available_presets()}, 'refresh-button') + with gr.Column(): + shared.gradio['seed'] = gr.Number(value=shared.settings['seed'], label='Seed (-1 for random)') + + with gr.Row(): + with gr.Column(): + with gr.Box(): + gr.Markdown('Custom generation parameters ([reference](https://huggingface.co./docs/transformers/main_classes/text_generation#transformers.GenerationConfig))') + with gr.Row(): + with gr.Column(): + shared.gradio['temperature'] = gr.Slider(0.01, 1.99, value=generate_params['temperature'], step=0.01, label='temperature') + shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p') + shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k') + shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, 
value=generate_params['typical_p'], step=0.01, label='typical_p') + with gr.Column(): + shared.gradio['repetition_penalty'] = gr.Slider(1.0, 1.5, value=generate_params['repetition_penalty'], step=0.01, label='repetition_penalty') + shared.gradio['encoder_repetition_penalty'] = gr.Slider(0.8, 1.5, value=generate_params['encoder_repetition_penalty'], step=0.01, label='encoder_repetition_penalty') + shared.gradio['no_repeat_ngram_size'] = gr.Slider(0, 20, step=1, value=generate_params['no_repeat_ngram_size'], label='no_repeat_ngram_size') + shared.gradio['min_length'] = gr.Slider(0, 2000, step=1, value=generate_params['min_length'] if shared.args.no_stream else 0, label='min_length', interactive=shared.args.no_stream) + shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') + with gr.Column(): + with gr.Box(): + gr.Markdown('Contrastive search') + shared.gradio['penalty_alpha'] = gr.Slider(0, 5, value=generate_params['penalty_alpha'], label='penalty_alpha') + with gr.Box(): + gr.Markdown('Beam search (uses a lot of VRAM)') + with gr.Row(): + with gr.Column(): + shared.gradio['num_beams'] = gr.Slider(1, 20, step=1, value=generate_params['num_beams'], label='num_beams') + with gr.Column(): + shared.gradio['length_penalty'] = gr.Slider(-5, 5, value=generate_params['length_penalty'], label='length_penalty') + shared.gradio['early_stopping'] = gr.Checkbox(value=generate_params['early_stopping'], label='early_stopping') + + with gr.Accordion('Soft prompt', open=False): + with gr.Row(): + shared.gradio['softprompts_menu'] = gr.Dropdown(choices=available_softprompts, value='None', label='Soft prompt') + ui.create_refresh_button(shared.gradio['softprompts_menu'], lambda: None, lambda: {'choices': get_available_softprompts()}, 'refresh-button') + + gr.Markdown('Upload a soft prompt (.zip format):') + with gr.Row(): + shared.gradio['upload_softprompt'] = gr.File(type='binary', file_types=['.zip']) + + 
shared.gradio['preset_menu'].change(load_preset_values, [shared.gradio[k] for k in ['preset_menu', 'generate_state']], [shared.gradio[k] for k in ['generate_state', 'do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping']]) + shared.gradio['softprompts_menu'].change(load_soft_prompt, shared.gradio['softprompts_menu'], shared.gradio['softprompts_menu'], show_progress=True) + shared.gradio['upload_softprompt'].upload(upload_soft_prompt, shared.gradio['upload_softprompt'], shared.gradio['softprompts_menu']) + + +def set_interface_arguments(interface_mode, extensions, bool_active): + modes = ["default", "notebook", "chat", "cai_chat"] + cmd_list = vars(shared.args) + bool_list = [k for k in cmd_list if type(cmd_list[k]) is bool and k not in modes] + + shared.args.extensions = extensions + for k in modes[1:]: + exec(f"shared.args.{k} = False") + if interface_mode != "default": + exec(f"shared.args.{interface_mode} = True") + + for k in bool_list: + exec(f"shared.args.{k} = False") + for k in bool_active: + exec(f"shared.args.{k} = True") + + shared.need_restart = True + + +available_models = get_available_models() +available_presets = get_available_presets() +available_characters = get_available_characters() +available_softprompts = get_available_softprompts() +available_loras = get_available_loras() + +# Default extensions +extensions_module.available_extensions = get_available_extensions() +if shared.is_chat(): + for extension in shared.settings['chat_default_extensions']: + shared.args.extensions = shared.args.extensions or [] + if extension not in shared.args.extensions: + shared.args.extensions.append(extension) +else: + for extension in shared.settings['default_extensions']: + shared.args.extensions = shared.args.extensions or [] + if extension not in shared.args.extensions: + 
shared.args.extensions.append(extension) + +# Default model +if shared.args.model is not None: + shared.model_name = shared.args.model +else: + if len(available_models) == 0: + print('No models are available! Please download at least one.') + sys.exit(0) + elif len(available_models) == 1: + i = 0 + else: + print('The following models are available:\n') + for i, model in enumerate(available_models): + print(f'{i+1}. {model}') + print(f'\nWhich one do you want to load? 1-{len(available_models)}\n') + i = int(input()) - 1 + print() + shared.model_name = available_models[i] +shared.model, shared.tokenizer = load_model(shared.model_name) +if shared.args.lora: + add_lora_to_model(shared.args.lora) + +# Default UI settings +default_preset = shared.settings['presets'][next((k for k in shared.settings['presets'] if re.match(k.lower(), shared.model_name.lower())), 'default')] +if shared.lora_name != "None": + default_text = load_prompt(shared.settings['lora_prompts'][next((k for k in shared.settings['lora_prompts'] if re.match(k.lower(), shared.lora_name.lower())), 'default')]) +else: + default_text = load_prompt(shared.settings['prompts'][next((k for k in shared.settings['prompts'] if re.match(k.lower(), shared.model_name.lower())), 'default')]) +title = 'Text generation web UI' + + +def create_interface(): + gen_events = [] + if shared.args.extensions is not None and len(shared.args.extensions) > 0: + extensions_module.load_extensions() + + with gr.Blocks(css=ui.css if not shared.is_chat() else ui.css + ui.chat_css, analytics_enabled=False, title=title) as shared.gradio['interface']: + if shared.is_chat(): + shared.gradio['Chat input'] = gr.State() + with gr.Tab("Text generation", elem_id="main"): + shared.gradio['display'] = gr.HTML(value=chat_html_wrapper(shared.history['visible'], shared.settings['name1'], shared.settings['name2'], 'cai-chat')) + shared.gradio['textbox'] = gr.Textbox(label='Input') + with gr.Row(): + shared.gradio['Generate'] = gr.Button('Generate') + 
shared.gradio['Stop'] = gr.Button('Stop', elem_id="stop") + with gr.Row(): + shared.gradio['Impersonate'] = gr.Button('Impersonate') + shared.gradio['Regenerate'] = gr.Button('Regenerate') + with gr.Row(): + shared.gradio['Copy last reply'] = gr.Button('Copy last reply') + shared.gradio['Replace last reply'] = gr.Button('Replace last reply') + shared.gradio['Remove last'] = gr.Button('Remove last') + + shared.gradio['Clear history'] = gr.Button('Clear history') + shared.gradio['Clear history-confirm'] = gr.Button('Confirm', variant="stop", visible=False) + shared.gradio['Clear history-cancel'] = gr.Button('Cancel', visible=False) + + shared.gradio["Chat mode"] = gr.Radio(choices=["cai-chat", "chat", "instruct"], value="cai-chat", label="Mode") + shared.gradio["Instruction templates"] = gr.Dropdown(choices=get_available_instruction_templates(), label="Instruction template", value="None", visible=False) + + with gr.Tab("Character", elem_id="chat-settings"): + with gr.Row(): + with gr.Column(scale=8): + shared.gradio['name1'] = gr.Textbox(value=shared.settings['name1'], lines=1, label='Your name') + shared.gradio['name2'] = gr.Textbox(value=shared.settings['name2'], lines=1, label='Character\'s name') + shared.gradio['greeting'] = gr.Textbox(value=shared.settings['greeting'], lines=4, label='Greeting') + shared.gradio['context'] = gr.Textbox(value=shared.settings['context'], lines=4, label='Context') + shared.gradio['end_of_turn'] = gr.Textbox(value=shared.settings["end_of_turn"], lines=1, label='End of turn string') + with gr.Column(scale=1): + shared.gradio['character_picture'] = gr.Image(label='Character picture', type="pil") + shared.gradio['your_picture'] = gr.Image(label='Your picture', type="pil", value=Image.open(Path("cache/pfp_me.png")) if Path("cache/pfp_me.png").exists() else None) + with gr.Row(): + shared.gradio['character_menu'] = gr.Dropdown(choices=available_characters, value='None', label='Character', elem_id='character-menu') + 
ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': get_available_characters()}, 'refresh-button') + + with gr.Row(): + with gr.Tab('Chat history'): + with gr.Row(): + with gr.Column(): + gr.Markdown('Upload') + shared.gradio['upload_chat_history'] = gr.File(type='binary', file_types=['.json', '.txt']) + with gr.Column(): + gr.Markdown('Download') + shared.gradio['download'] = gr.File() + shared.gradio['download_button'] = gr.Button(value='Click me') + with gr.Tab('Upload character'): + gr.Markdown("# JSON format") + with gr.Row(): + with gr.Column(): + gr.Markdown('1. Select the JSON file') + shared.gradio['upload_json'] = gr.File(type='binary', file_types=['.json']) + with gr.Column(): + gr.Markdown('2. Select your character\'s profile picture (optional)') + shared.gradio['upload_img_bot'] = gr.File(type='binary', file_types=['image']) + shared.gradio['Upload character'] = gr.Button(value='Submit') + + gr.Markdown("# TavernAI PNG format") + shared.gradio['upload_img_tavern'] = gr.File(type='binary', file_types=['image']) + + with gr.Tab("Parameters", elem_id="parameters"): + with gr.Box(): + gr.Markdown("Chat parameters") + with gr.Row(): + with gr.Column(): + shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) + shared.gradio['chat_prompt_size_slider'] = gr.Slider(minimum=shared.settings['chat_prompt_size_min'], maximum=shared.settings['chat_prompt_size_max'], step=1, label='Maximum prompt size in tokens', value=shared.settings['chat_prompt_size']) + with gr.Column(): + shared.gradio['chat_generation_attempts'] = gr.Slider(minimum=shared.settings['chat_generation_attempts_min'], maximum=shared.settings['chat_generation_attempts_max'], value=shared.settings['chat_generation_attempts'], step=1, label='Generation attempts (for longer replies)') + 
shared.gradio['stop_at_newline'] = gr.Checkbox(value=shared.settings['stop_at_newline'], label='Stop generating at new line character?') + + create_settings_menus(default_preset) + + shared.input_params = [shared.gradio[k] for k in ['Chat input', 'generate_state', 'name1', 'name2', 'context', 'Chat mode', 'end_of_turn']] + + def set_chat_input(textbox): + return textbox, "" + + gen_events.append(shared.gradio['Generate'].click(set_chat_input, shared.gradio['textbox'], [shared.gradio['Chat input'], shared.gradio['textbox']], show_progress=False)) + gen_events.append(shared.gradio['Generate'].click(chat.cai_chatbot_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream)) + gen_events.append(shared.gradio['textbox'].submit(set_chat_input, shared.gradio['textbox'], [shared.gradio['Chat input'], shared.gradio['textbox']], show_progress=False)) + gen_events.append(shared.gradio['textbox'].submit(chat.cai_chatbot_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream)) + gen_events.append(shared.gradio['Regenerate'].click(chat.regenerate_wrapper, shared.input_params, shared.gradio['display'], show_progress=shared.args.no_stream)) + gen_events.append(shared.gradio['Impersonate'].click(chat.impersonate_wrapper, shared.input_params, shared.gradio['textbox'], show_progress=shared.args.no_stream)) + shared.gradio['Stop'].click(stop_everything_event, [], [], queue=False, cancels=gen_events if shared.args.no_stream else None) + + shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, [], shared.gradio['textbox'], show_progress=shared.args.no_stream) + shared.gradio['Replace last reply'].click(chat.replace_last_reply, [shared.gradio[k] for k in ['textbox', 'name1', 'name2', 'Chat mode']], shared.gradio['display'], show_progress=shared.args.no_stream) + + # Clear history with confirmation + clear_arr = [shared.gradio[k] for k in ['Clear history-confirm', 'Clear history', 'Clear 
history-cancel']] + shared.gradio['Clear history'].click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, clear_arr) + shared.gradio['Clear history-confirm'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, clear_arr) + shared.gradio['Clear history-confirm'].click(chat.clear_chat_log, [shared.gradio[k] for k in ['name1', 'name2', 'greeting', 'Chat mode']], shared.gradio['display']) + shared.gradio['Clear history-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, clear_arr) + shared.gradio['Chat mode'].change(lambda x: gr.update(visible=x == 'instruct'), shared.gradio['Chat mode'], shared.gradio['Instruction templates']) + + shared.gradio['Remove last'].click(chat.remove_last_message, [shared.gradio[k] for k in ['name1', 'name2', 'Chat mode']], [shared.gradio['display'], shared.gradio['textbox']], show_progress=False) + shared.gradio['download_button'].click(chat.save_history, inputs=[], outputs=[shared.gradio['download']]) + shared.gradio['Upload character'].click(chat.upload_character, [shared.gradio['upload_json'], shared.gradio['upload_img_bot']], [shared.gradio['character_menu']]) + + # Clearing stuff and saving the history + for i in ['Generate', 'Regenerate', 'Replace last reply']: + shared.gradio[i].click(lambda x: '', shared.gradio['textbox'], shared.gradio['textbox'], show_progress=False) + shared.gradio[i].click(lambda: chat.save_history(timestamp=False), [], [], show_progress=False) + shared.gradio['Clear history-confirm'].click(lambda: chat.save_history(timestamp=False), [], [], show_progress=False) + shared.gradio['textbox'].submit(lambda x: '', shared.gradio['textbox'], shared.gradio['textbox'], show_progress=False) + shared.gradio['textbox'].submit(lambda: chat.save_history(timestamp=False), [], [], show_progress=False) + + shared.gradio['character_menu'].change(chat.load_character, [shared.gradio[k] 
for k in ['character_menu', 'name1', 'name2', 'Chat mode']], [shared.gradio[k] for k in ['name1', 'name2', 'character_picture', 'greeting', 'context', 'end_of_turn', 'display']]) + shared.gradio['Instruction templates'].change(lambda character, name1, name2, mode: chat.load_character(character, name1, name2, mode), [shared.gradio[k] for k in ['Instruction templates', 'name1', 'name2', 'Chat mode']], [shared.gradio[k] for k in ['name1', 'name2', 'character_picture', 'greeting', 'context', 'end_of_turn', 'display']]) + shared.gradio['upload_chat_history'].upload(chat.load_history, [shared.gradio[k] for k in ['upload_chat_history', 'name1', 'name2']], []) + shared.gradio['upload_img_tavern'].upload(chat.upload_tavern_character, [shared.gradio['upload_img_tavern'], shared.gradio['name1'], shared.gradio['name2']], [shared.gradio['character_menu']]) + shared.gradio['your_picture'].change(chat.upload_your_profile_picture, [shared.gradio[k] for k in ['your_picture', 'name1', 'name2', 'Chat mode']], shared.gradio['display']) + + reload_inputs = [shared.gradio[k] for k in ['name1', 'name2', 'Chat mode']] + shared.gradio['upload_chat_history'].upload(chat.redraw_html, reload_inputs, [shared.gradio['display']]) + shared.gradio['Stop'].click(chat.redraw_html, reload_inputs, [shared.gradio['display']]) + shared.gradio['Instruction templates'].change(chat.redraw_html, reload_inputs, [shared.gradio['display']]) + shared.gradio['Chat mode'].change(chat.redraw_html, reload_inputs, [shared.gradio['display']]) + + shared.gradio['interface'].load(None, None, None, _js=f"() => {{{ui.main_js+ui.chat_js}}}") + shared.gradio['interface'].load(lambda: chat.load_default_history(shared.settings['name1'], shared.settings['name2']), None, None) + shared.gradio['interface'].load(chat.redraw_html, reload_inputs, [shared.gradio['display']], show_progress=True) + + elif shared.args.notebook: + with gr.Tab("Text generation", elem_id="main"): + with gr.Row(): + with gr.Column(scale=4): + with 
gr.Tab('Raw'): + shared.gradio['textbox'] = gr.Textbox(value=default_text, elem_id="textbox", lines=27) + with gr.Tab('Markdown'): + shared.gradio['markdown'] = gr.Markdown() + with gr.Tab('HTML'): + shared.gradio['html'] = gr.HTML() + + with gr.Row(): + with gr.Column(): + with gr.Row(): + shared.gradio['Generate'] = gr.Button('Generate') + shared.gradio['Stop'] = gr.Button('Stop') + with gr.Column(): + pass + + with gr.Column(scale=1): + gr.HTML('
') + shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) + + create_prompt_menus() + + with gr.Tab("Parameters", elem_id="parameters"): + create_settings_menus(default_preset) + + shared.input_params = [shared.gradio[k] for k in ['textbox', 'generate_state']] + output_params = [shared.gradio[k] for k in ['textbox', 'markdown', 'html']] + gen_events.append(shared.gradio['Generate'].click(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream)) + gen_events.append(shared.gradio['textbox'].submit(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream)) + shared.gradio['Stop'].click(stop_everything_event, [], [], queue=False, cancels=gen_events if shared.args.no_stream else None) + shared.gradio['interface'].load(None, None, None, _js=f"() => {{{ui.main_js}}}") + + else: + with gr.Tab("Text generation", elem_id="main"): + with gr.Row(): + with gr.Column(): + shared.gradio['textbox'] = gr.Textbox(value=default_text, lines=21, label='Input') + shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], step=1, label='max_new_tokens', value=shared.settings['max_new_tokens']) + shared.gradio['Generate'] = gr.Button('Generate') + with gr.Row(): + with gr.Column(): + shared.gradio['Continue'] = gr.Button('Continue') + with gr.Column(): + shared.gradio['Stop'] = gr.Button('Stop') + + create_prompt_menus() + + with gr.Column(): + with gr.Tab('Raw'): + shared.gradio['output_textbox'] = gr.Textbox(lines=27, label='Output') + with gr.Tab('Markdown'): + shared.gradio['markdown'] = gr.Markdown() + with gr.Tab('HTML'): + shared.gradio['html'] = gr.HTML() + + with gr.Tab("Parameters", elem_id="parameters"): + create_settings_menus(default_preset) + + shared.input_params = 
[shared.gradio[k] for k in ['textbox', 'generate_state']] + output_params = [shared.gradio[k] for k in ['output_textbox', 'markdown', 'html']] + gen_events.append(shared.gradio['Generate'].click(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream)) + gen_events.append(shared.gradio['textbox'].submit(generate_reply, shared.input_params, output_params, show_progress=shared.args.no_stream)) + gen_events.append(shared.gradio['Continue'].click(generate_reply, [shared.gradio['output_textbox']] + shared.input_params[1:], output_params, show_progress=shared.args.no_stream)) + shared.gradio['Stop'].click(stop_everything_event, [], [], queue=False, cancels=gen_events if shared.args.no_stream else None) + shared.gradio['interface'].load(None, None, None, _js=f"() => {{{ui.main_js}}}") + + with gr.Tab("Model", elem_id="model-tab"): + create_model_menus() + + with gr.Tab("Training", elem_id="training-tab"): + training.create_train_interface() + + with gr.Tab("Interface mode", elem_id="interface-mode"): + modes = ["default", "notebook", "chat", "cai_chat"] + current_mode = "default" + for mode in modes[1:]: + if eval(f"shared.args.{mode}"): + current_mode = mode + break + cmd_list = vars(shared.args) + bool_list = [k for k in cmd_list if type(cmd_list[k]) is bool and k not in modes] + bool_active = [k for k in bool_list if vars(shared.args)[k]] + + gr.Markdown("*Experimental*") + shared.gradio['interface_modes_menu'] = gr.Dropdown(choices=modes, value=current_mode, label="Mode") + shared.gradio['extensions_menu'] = gr.CheckboxGroup(choices=get_available_extensions(), value=shared.args.extensions, label="Available extensions") + shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=bool_list, value=bool_active, label="Boolean command-line flags") + shared.gradio['reset_interface'] = gr.Button("Apply and restart the interface") + + shared.gradio['reset_interface'].click(set_interface_arguments, [shared.gradio[k] for k in ['interface_modes_menu', 
'extensions_menu', 'bool_menu']], None) + shared.gradio['reset_interface'].click(lambda: None, None, None, _js='() => {document.body.innerHTML=\'

Reloading...

\'; setTimeout(function(){location.reload()},2500); return []}') + + if shared.args.extensions is not None: + extensions_module.create_extensions_block() + + def change_dict_value(d, key, value): + d[key] = value + return d + + for k in ['do_sample', 'temperature', 'top_p', 'typical_p', 'repetition_penalty', 'encoder_repetition_penalty', 'top_k', 'min_length', 'no_repeat_ngram_size', 'num_beams', 'penalty_alpha', 'length_penalty', 'early_stopping', 'max_new_tokens', 'seed', 'stop_at_newline', 'chat_prompt_size_slider', 'chat_generation_attempts']: + if k not in shared.gradio: + continue + if type(shared.gradio[k]) in [gr.Checkbox, gr.Number]: + shared.gradio[k].change(lambda state, value, copy=k: change_dict_value(state, copy, value), inputs=[shared.gradio['generate_state'], shared.gradio[k]], outputs=shared.gradio['generate_state']) + else: + shared.gradio[k].release(lambda state, value, copy=k: change_dict_value(state, copy, value), inputs=[shared.gradio['generate_state'], shared.gradio[k]], outputs=shared.gradio['generate_state']) + + if not shared.is_chat(): + api.create_apis() + + # Authentication + auth = None + if shared.args.gradio_auth_path is not None: + gradio_auth_creds = [] + with open(shared.args.gradio_auth_path, 'r', encoding="utf8") as file: + for line in file.readlines(): + gradio_auth_creds += [x.strip() for x in line.split(',') if x.strip()] + auth = [tuple(cred.split(':')) for cred in gradio_auth_creds] + + # Launch the interface + shared.gradio['interface'].queue() + if shared.args.listen: + shared.gradio['interface'].launch(prevent_thread_lock=True, share=True, server_name='0.0.0.0', server_port=shared.args.listen_port, inbrowser=shared.args.auto_launch, auth=auth) + else: + shared.gradio['interface'].launch(prevent_thread_lock=True, share=True, server_port=shared.args.listen_port, inbrowser=shared.args.auto_launch, auth=auth) + + +create_interface() + +while True: + time.sleep(0.5) + if shared.need_restart: + shared.need_restart = False + 
shared.gradio['interface'].close() + create_interface() diff --git a/text-generation-webui/settings-template.json b/text-generation-webui/settings-template.json new file mode 100644 index 0000000000000000000000000000000000000000..ea456fd62297c0532b2f6f79b2924a7546ef2621 --- /dev/null +++ b/text-generation-webui/settings-template.json @@ -0,0 +1,36 @@ +{ + "max_new_tokens": 200, + "max_new_tokens_min": 1, + "max_new_tokens_max": 2000, + "seed": -1, + "name1": "You", + "name2": "Assistant", + "context": "This is a conversation with your Assistant. The Assistant is very helpful and is eager to chat with you and answer your questions.", + "greeting": "Hello there!", + "stop_at_newline": false, + "chat_prompt_size": 2048, + "chat_prompt_size_min": 0, + "chat_prompt_size_max": 2048, + "chat_generation_attempts": 1, + "chat_generation_attempts_min": 1, + "chat_generation_attempts_max": 5, + "default_extensions": [], + "chat_default_extensions": [ + "gallery" + ], + "presets": { + "default": "NovelAI-Sphinx Moth", + ".*pygmalion": "NovelAI-Storywriter", + ".*RWKV": "Naive" + }, + "prompts": { + "default": "QA", + ".*(gpt4chan|gpt-4chan|4chan)": "GPT-4chan", + ".*oasst": "Open Assistant", + ".*alpaca": "Alpaca" + }, + "lora_prompts": { + "default": "QA", + ".*(alpaca-lora-7b|alpaca-lora-13b|alpaca-lora-30b)": "Alpaca" + } +} diff --git a/text-generation-webui/softprompts/place-your-softprompts-here.txt b/text-generation-webui/softprompts/place-your-softprompts-here.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/text-generation-webui/training/datasets/put-trainer-datasets-here.txt b/text-generation-webui/training/datasets/put-trainer-datasets-here.txt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/text-generation-webui/training/formats/alpaca-chatbot-format.json 
b/text-generation-webui/training/formats/alpaca-chatbot-format.json new file mode 100644 index 0000000000000000000000000000000000000000..4b38103f4c23de004666e0316855db62e57d2ad0 --- /dev/null +++ b/text-generation-webui/training/formats/alpaca-chatbot-format.json @@ -0,0 +1,4 @@ +{ + "instruction,output": "User: %instruction%\nAssistant: %output%", + "instruction,input,output": "User: %instruction%: %input%\nAssistant: %output%" +} diff --git a/text-generation-webui/training/formats/alpaca-format.json b/text-generation-webui/training/formats/alpaca-format.json new file mode 100644 index 0000000000000000000000000000000000000000..dd6df95640360297257b618715370093b715b21f --- /dev/null +++ b/text-generation-webui/training/formats/alpaca-format.json @@ -0,0 +1,4 @@ +{ + "instruction,output": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n%instruction%\n\n### Response:\n%output%", + "instruction,input,output": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n%instruction%\n\n### Input:\n%input%\n\n### Response:\n%output%" +}