diff --git a/src/AntSK.Domain/Domain/Service/LLamaFactoryService.cs b/src/AntSK.Domain/Domain/Service/LLamaFactoryService.cs index b215ba1..d07cd7b 100644 --- a/src/AntSK.Domain/Domain/Service/LLamaFactoryService.cs +++ b/src/AntSK.Domain/Domain/Service/LLamaFactoryService.cs @@ -87,7 +87,7 @@ namespace AntSK.Domain.Domain.Service StartInfo = new ProcessStartInfo { FileName = "python", - Arguments = "api_demo.py --model_name_or_path " + modelName + " --template " + templateName + " ", + Arguments = "api_antsk.py --model_name_or_path " + modelName + " --template " + templateName + " ", UseShellExecute = false, RedirectStandardOutput = true, RedirectStandardError=true, diff --git a/src/AntSK.LLamaFactory/llamafactory/api_antsk.py b/src/AntSK.LLamaFactory/llamafactory/api_antsk.py new file mode 100644 index 0000000..3655e39 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/api_antsk.py @@ -0,0 +1,19 @@ +import os + +import uvicorn + +from llamafactory.api.app import create_app +from llamafactory.chat import ChatModel + + +def main(): + chat_model = ChatModel() + app = create_app(chat_model) + api_host = os.environ.get("API_HOST", "0.0.0.0") + api_port = int(os.environ.get("API_PORT", "8000")) + print("Visit http://localhost:{}/docs for API document.".format(api_port)) + uvicorn.run(app, host=api_host, port=api_port) + + +if __name__ == "__main__": + main() diff --git a/src/AntSK.LLamaFactory/llamafactory/api_demo.py b/src/AntSK.LLamaFactory/llamafactory/api_demo.py deleted file mode 100644 index a714067..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/api_demo.py +++ /dev/null @@ -1,16 +0,0 @@ -import os - -import uvicorn - -from llmtuner import ChatModel, create_app - - -def main(): - chat_model = ChatModel() - app = create_app(chat_model) - print("Visit http://localhost:{}/docs for API document.".format(os.environ.get("API_PORT", 8000))) - uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("API_PORT", 8000)), workers=1) - - -if __name__ == "__main__": - main() diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/__init__.py new file mode 100644 index 0000000..7823093 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/__init__.py @@ -0,0 +1,6 @@ +# Level: api, webui > chat, eval, train > data, model > hparams > extras + +from .cli import VERSION + + +__version__ = VERSION diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/api/__init__.py similarity index 100% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/__init__.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/api/__init__.py diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/api/app.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/api/app.py new file mode 100644 index 0000000..21edab2 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/api/app.py @@ -0,0 +1,108 @@ +import os +from contextlib import asynccontextmanager +from typing import Optional + +from typing_extensions import Annotated + +from ..chat import ChatModel +from ..extras.misc import torch_gc +from ..extras.packages import is_fastapi_available, is_starlette_available, is_uvicorn_available +from .chat import ( + create_chat_completion_response, + create_score_evaluation_response, + create_stream_chat_completion_response, +) +from .protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, + ModelCard, + ModelList, + 
ScoreEvaluationRequest, + ScoreEvaluationResponse, +) + + +if is_fastapi_available(): + from fastapi import Depends, FastAPI, HTTPException, status + from fastapi.middleware.cors import CORSMiddleware + from fastapi.security.http import HTTPAuthorizationCredentials, HTTPBearer + + +if is_starlette_available(): + from sse_starlette import EventSourceResponse + + +if is_uvicorn_available(): + import uvicorn + + +@asynccontextmanager +async def lifespan(app: "FastAPI"): # collects GPU memory + yield + torch_gc() + + +def create_app(chat_model: "ChatModel") -> "FastAPI": + app = FastAPI(lifespan=lifespan) + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + api_key = os.environ.get("API_KEY") + security = HTTPBearer(auto_error=False) + + async def verify_api_key(auth: Annotated[Optional[HTTPAuthorizationCredentials], Depends(security)]): + if api_key and (auth is None or auth.credentials != api_key): + raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key.") + + @app.get( + "/v1/models", + response_model=ModelList, + status_code=status.HTTP_200_OK, + dependencies=[Depends(verify_api_key)], + ) + async def list_models(): + model_card = ModelCard(id="gpt-3.5-turbo") + return ModelList(data=[model_card]) + + @app.post( + "/v1/chat/completions", + response_model=ChatCompletionResponse, + status_code=status.HTTP_200_OK, + dependencies=[Depends(verify_api_key)], + ) + async def create_chat_completion(request: ChatCompletionRequest): + if not chat_model.engine.can_generate: + raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed") + + if request.stream: + generate = create_stream_chat_completion_response(request, chat_model) + return EventSourceResponse(generate, media_type="text/event-stream") + else: + return await create_chat_completion_response(request, chat_model) + + @app.post( + "/v1/score/evaluation", + response_model=ScoreEvaluationResponse, + status_code=status.HTTP_200_OK, + dependencies=[Depends(verify_api_key)], + ) + async def create_score_evaluation(request: ScoreEvaluationRequest): + if chat_model.engine.can_generate: + raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed") + + return await create_score_evaluation_response(request, chat_model) + + return app + + +def run_api() -> None: + chat_model = ChatModel() + app = create_app(chat_model) + api_host = os.environ.get("API_HOST", "0.0.0.0") + api_port = int(os.environ.get("API_PORT", "8000")) + print("Visit http://localhost:{}/docs for API document.".format(api_port)) + uvicorn.run(app, host=api_host, port=api_port) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/api/chat.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/api/chat.py new file mode 100644 index 0000000..98957bc --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/api/chat.py @@ -0,0 +1,219 @@ +import base64 +import io +import json +import os +import uuid +from typing import TYPE_CHECKING, AsyncGenerator, Dict, List, Optional, Tuple + +from ..data import Role as DataRole +from ..extras.logging import get_logger +from ..extras.packages import is_fastapi_available, is_pillow_available, is_requests_available +from .common import dictify, jsonify +from .protocol import ( + ChatCompletionMessage, + ChatCompletionResponse, + ChatCompletionResponseChoice, + ChatCompletionResponseUsage, + ChatCompletionStreamResponse, + ChatCompletionStreamResponseChoice, + 
Finish, + Function, + FunctionCall, + Role, + ScoreEvaluationResponse, +) + + +if is_fastapi_available(): + from fastapi import HTTPException, status + + +if is_pillow_available(): + from PIL import Image + + +if is_requests_available(): + import requests + + +if TYPE_CHECKING: + from numpy.typing import NDArray + + from ..chat import ChatModel + from .protocol import ChatCompletionRequest, ScoreEvaluationRequest + + +logger = get_logger(__name__) +ROLE_MAPPING = { + Role.USER: DataRole.USER.value, + Role.ASSISTANT: DataRole.ASSISTANT.value, + Role.SYSTEM: DataRole.SYSTEM.value, + Role.FUNCTION: DataRole.FUNCTION.value, + Role.TOOL: DataRole.OBSERVATION.value, +} + + +def _process_request( + request: "ChatCompletionRequest", +) -> Tuple[List[Dict[str, str]], Optional[str], Optional[str], Optional["NDArray"]]: + logger.info("==== request ====\n{}".format(json.dumps(dictify(request), indent=2, ensure_ascii=False))) + + if len(request.messages) == 0: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length") + + if request.messages[0].role == Role.SYSTEM: + system = request.messages.pop(0).content + else: + system = None + + if len(request.messages) % 2 == 0: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only supports u/a/u/a/u...") + + input_messages = [] + image = None + for i, message in enumerate(request.messages): + if i % 2 == 0 and message.role not in [Role.USER, Role.TOOL]: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role") + elif i % 2 == 1 and message.role not in [Role.ASSISTANT, Role.FUNCTION]: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role") + + if message.role == Role.ASSISTANT and isinstance(message.tool_calls, list) and len(message.tool_calls): + name = message.tool_calls[0].function.name + arguments = message.tool_calls[0].function.arguments + content = json.dumps({"name": name, "argument": arguments}, ensure_ascii=False) + input_messages.append({"role": ROLE_MAPPING[Role.FUNCTION], "content": content}) + elif isinstance(message.content, list): + for input_item in message.content: + if input_item.type == "text": + input_messages.append({"role": ROLE_MAPPING[message.role], "content": input_item.text}) + else: + image_url = input_item.image_url.url + if image_url.startswith("data:image"): # base64 image + image_data = base64.b64decode(image_url.split(",", maxsplit=1)[1]) + image_path = io.BytesIO(image_data) + elif os.path.isfile(image_url): # local file + image_path = open(image_url, "rb") + else: # web uri + image_path = requests.get(image_url, stream=True).raw + + image = Image.open(image_path).convert("RGB") + else: + input_messages.append({"role": ROLE_MAPPING[message.role], "content": message.content}) + + tool_list = request.tools + if isinstance(tool_list, list) and len(tool_list): + try: + tools = json.dumps([dictify(tool.function) for tool in tool_list], ensure_ascii=False) + except Exception: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools") + else: + tools = None + + return input_messages, system, tools, image + + +def _create_stream_chat_completion_chunk( + completion_id: str, + model: str, + delta: "ChatCompletionMessage", + index: Optional[int] = 0, + finish_reason: Optional["Finish"] = None, +) -> str: + choice_data = ChatCompletionStreamResponseChoice(index=index, delta=delta, finish_reason=finish_reason) + chunk = ChatCompletionStreamResponse(id=completion_id, model=model, choices=[choice_data]) + 
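+ # serialize the chunk to a JSON string so the caller can emit it as a server-sent event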
return jsonify(chunk) + + +async def create_chat_completion_response( + request: "ChatCompletionRequest", chat_model: "ChatModel" +) -> "ChatCompletionResponse": + completion_id = "chatcmpl-{}".format(uuid.uuid4().hex) + input_messages, system, tools, image = _process_request(request) + responses = await chat_model.achat( + input_messages, + system, + tools, + image, + do_sample=request.do_sample, + temperature=request.temperature, + top_p=request.top_p, + max_new_tokens=request.max_tokens, + num_return_sequences=request.n, + stop=request.stop, + ) + + prompt_length, response_length = 0, 0 + choices = [] + for i, response in enumerate(responses): + if tools: + result = chat_model.engine.template.format_tools.extract(response.response_text) + else: + result = response.response_text + + if isinstance(result, tuple): + name, arguments = result + function = Function(name=name, arguments=arguments) + tool_call = FunctionCall(id="call_{}".format(uuid.uuid4().hex), function=function) + response_message = ChatCompletionMessage(role=Role.ASSISTANT, tool_calls=[tool_call]) + finish_reason = Finish.TOOL + else: + response_message = ChatCompletionMessage(role=Role.ASSISTANT, content=result) + finish_reason = Finish.STOP if response.finish_reason == "stop" else Finish.LENGTH + + choices.append(ChatCompletionResponseChoice(index=i, message=response_message, finish_reason=finish_reason)) + prompt_length = response.prompt_length + response_length += response.response_length + + usage = ChatCompletionResponseUsage( + prompt_tokens=prompt_length, + completion_tokens=response_length, + total_tokens=prompt_length + response_length, + ) + + return ChatCompletionResponse(id=completion_id, model=request.model, choices=choices, usage=usage) + + +async def create_stream_chat_completion_response( + request: "ChatCompletionRequest", chat_model: "ChatModel" +) -> AsyncGenerator[str, None]: + completion_id = "chatcmpl-{}".format(uuid.uuid4().hex) + input_messages, system, tools, image = _process_request(request) + if tools: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream function calls.") + + if request.n > 1: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream multiple responses.") + + yield _create_stream_chat_completion_chunk( + completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(role=Role.ASSISTANT, content="") + ) + async for new_token in chat_model.astream_chat( + input_messages, + system, + tools, + image, + do_sample=request.do_sample, + temperature=request.temperature, + top_p=request.top_p, + max_new_tokens=request.max_tokens, + stop=request.stop, + ): + if len(new_token) != 0: + yield _create_stream_chat_completion_chunk( + completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(content=new_token) + ) + + yield _create_stream_chat_completion_chunk( + completion_id=completion_id, model=request.model, delta=ChatCompletionMessage(), finish_reason=Finish.STOP + ) + yield "[DONE]" + + +async def create_score_evaluation_response( + request: "ScoreEvaluationRequest", chat_model: "ChatModel" +) -> "ScoreEvaluationResponse": + if len(request.messages) == 0: + raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request") + + scores = await chat_model.aget_scores(request.messages, max_length=request.max_length) + return ScoreEvaluationResponse(model=request.model, scores=scores) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/api/common.py 
b/src/AntSK.LLamaFactory/llamafactory/llamafactory/api/common.py new file mode 100644 index 0000000..5ad9a07 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/api/common.py @@ -0,0 +1,20 @@ +import json +from typing import TYPE_CHECKING, Any, Dict + + +if TYPE_CHECKING: + from pydantic import BaseModel + + +def dictify(data: "BaseModel") -> Dict[str, Any]: + try: # pydantic v2 + return data.model_dump(exclude_unset=True) + except AttributeError: # pydantic v1 + return data.dict(exclude_unset=True) + + +def jsonify(data: "BaseModel") -> str: + try: # pydantic v2 + return json.dumps(data.model_dump(exclude_unset=True), ensure_ascii=False) + except AttributeError: # pydantic v1 + return data.json(exclude_unset=True, ensure_ascii=False) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/api/protocol.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/api/protocol.py similarity index 72% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/api/protocol.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/api/protocol.py index 3e39fe0..055fa78 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/api/protocol.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/api/protocol.py @@ -1,6 +1,6 @@ import time from enum import Enum, unique -from typing import List, Optional +from typing import Any, Dict, List, Optional, Union from pydantic import BaseModel, Field from typing_extensions import Literal @@ -39,15 +39,37 @@ class Function(BaseModel): arguments: str +class FunctionDefinition(BaseModel): + name: str + description: str + parameters: Dict[str, Any] + + +class FunctionAvailable(BaseModel): + type: Literal["function", "code_interpreter"] = "function" + function: Optional[FunctionDefinition] = None + + class FunctionCall(BaseModel): - id: Literal["call_default"] = "call_default" + id: str type: Literal["function"] = "function" function: Function +class ImageURL(BaseModel): + url: str + + +class MultimodalInputItem(BaseModel): + type: Literal["text", "image_url"] + text: Optional[str] = None + image_url: Optional[ImageURL] = None + + class ChatMessage(BaseModel): role: Role - content: str + content: Optional[Union[str, List[MultimodalInputItem]]] = None + tool_calls: Optional[List[FunctionCall]] = None class ChatCompletionMessage(BaseModel): @@ -59,12 +81,13 @@ class ChatCompletionMessage(BaseModel): class ChatCompletionRequest(BaseModel): model: str messages: List[ChatMessage] - tools: list = [] + tools: Optional[List[FunctionAvailable]] = None do_sample: bool = True temperature: Optional[float] = None top_p: Optional[float] = None n: int = 1 max_tokens: Optional[int] = None + stop: Optional[Union[str, List[str]]] = None stream: bool = False @@ -74,7 +97,7 @@ class ChatCompletionResponseChoice(BaseModel): finish_reason: Finish -class ChatCompletionResponseStreamChoice(BaseModel): +class ChatCompletionStreamResponseChoice(BaseModel): index: int delta: ChatCompletionMessage finish_reason: Optional[Finish] = None @@ -87,7 +110,7 @@ class ChatCompletionResponseUsage(BaseModel): class ChatCompletionResponse(BaseModel): - id: Literal["chatcmpl-default"] = "chatcmpl-default" + id: str object: Literal["chat.completion"] = "chat.completion" created: int = Field(default_factory=lambda: int(time.time())) model: str @@ -96,11 +119,11 @@ class ChatCompletionResponse(BaseModel): class ChatCompletionStreamResponse(BaseModel): - id: Literal["chatcmpl-default"] = "chatcmpl-default" + id: str object: Literal["chat.completion.chunk"] = "chat.completion.chunk" 
created: int = Field(default_factory=lambda: int(time.time())) model: str - choices: List[ChatCompletionResponseStreamChoice] + choices: List[ChatCompletionStreamResponseChoice] class ScoreEvaluationRequest(BaseModel): @@ -110,7 +133,7 @@ class ScoreEvaluationRequest(BaseModel): class ScoreEvaluationResponse(BaseModel): - id: Literal["scoreeval-default"] = "scoreeval-default" + id: str object: Literal["score.evaluation"] = "score.evaluation" model: str scores: List[float] diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/chat/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/chat/__init__.py similarity index 100% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/chat/__init__.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/chat/__init__.py diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/chat/base_engine.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/chat/base_engine.py similarity index 91% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/chat/base_engine.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/chat/base_engine.py index c5db41d..65b6c59 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/chat/base_engine.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/chat/base_engine.py @@ -4,15 +4,13 @@ from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Literal, Opti if TYPE_CHECKING: + from numpy.typing import NDArray from transformers import PreTrainedModel, PreTrainedTokenizer + from vllm import AsyncLLMEngine from ..data import Template - from ..extras.packages import is_vllm_available from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments - if is_vllm_available(): - from vllm import AsyncLLMEngine - @dataclass class Response: @@ -49,6 +47,7 @@ class BaseEngine(ABC): messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, + image: Optional["NDArray"] = None, **input_kwargs, ) -> List["Response"]: ... @@ -58,6 +57,7 @@ class BaseEngine(ABC): messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, + image: Optional["NDArray"] = None, **input_kwargs, ) -> AsyncGenerator[str, None]: ... 
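For reference, the OpenAI-compatible server that `LLamaFactoryService` now starts through `api_antsk.py` can be exercised with any plain HTTP client. A minimal sketch, not part of the patch, assuming the server is listening on the default `API_PORT` of 8000 and no `API_KEY` environment variable is set (the `model` field is echoed back and does not select a model):

```python
import requests

# Non-streaming chat completion against the /v1/chat/completions route defined in llamafactory/api/app.py.
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hello!"}],
        "temperature": 0.7,
        "max_tokens": 128,
        "stream": False,
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```

Setting `stream` to true switches the endpoint to server-sent events via `EventSourceResponse`, and when `API_KEY` is set the request must carry a matching bearer token or it is rejected with 401.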
diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/chat/chat_model.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/chat/chat_model.py similarity index 63% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/chat/chat_model.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/chat/chat_model.py index c49d4d7..281ef0c 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/chat/chat_model.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/chat/chat_model.py @@ -2,12 +2,15 @@ import asyncio from threading import Thread from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, Generator, List, Optional, Sequence +from ..extras.misc import torch_gc from ..hparams import get_infer_args from .hf_engine import HuggingfaceEngine from .vllm_engine import VllmEngine if TYPE_CHECKING: + from numpy.typing import NDArray + from .base_engine import BaseEngine, Response @@ -36,9 +39,10 @@ class ChatModel: messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, + image: Optional["NDArray"] = None, **input_kwargs, ) -> List["Response"]: - task = asyncio.run_coroutine_threadsafe(self.achat(messages, system, tools, **input_kwargs), self._loop) + task = asyncio.run_coroutine_threadsafe(self.achat(messages, system, tools, image, **input_kwargs), self._loop) return task.result() async def achat( @@ -46,18 +50,20 @@ class ChatModel: messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, + image: Optional["NDArray"] = None, **input_kwargs, ) -> List["Response"]: - return await self.engine.chat(messages, system, tools, **input_kwargs) + return await self.engine.chat(messages, system, tools, image, **input_kwargs) def stream_chat( self, messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, + image: Optional["NDArray"] = None, **input_kwargs, ) -> Generator[str, None, None]: - generator = self.astream_chat(messages, system, tools, **input_kwargs) + generator = self.astream_chat(messages, system, tools, image, **input_kwargs) while True: try: task = asyncio.run_coroutine_threadsafe(generator.__anext__(), self._loop) @@ -70,9 +76,10 @@ class ChatModel: messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, + image: Optional["NDArray"] = None, **input_kwargs, ) -> AsyncGenerator[str, None]: - async for new_token in self.engine.stream_chat(messages, system, tools, **input_kwargs): + async for new_token in self.engine.stream_chat(messages, system, tools, image, **input_kwargs): yield new_token def get_scores( @@ -89,3 +96,45 @@ class ChatModel: **input_kwargs, ) -> List[float]: return await self.engine.get_scores(batch_input, **input_kwargs) + + +def run_chat() -> None: + try: + import platform + + if platform.system() != "Windows": + import readline # noqa: F401 + except ImportError: + print("Install `readline` for a better experience.") + + chat_model = ChatModel() + messages = [] + print("Welcome to the CLI application, use `clear` to remove the history, use `exit` to exit the application.") + + while True: + try: + query = input("\nUser: ") + except UnicodeDecodeError: + print("Detected decoding error at the inputs, please set the terminal encoding to utf-8.") + continue + except Exception: + raise + + if query.strip() == "exit": + break + + if query.strip() == "clear": + messages = [] + torch_gc() + print("History has been removed.") + continue + + messages.append({"role": "user", "content": query}) + print("Assistant: ", 
end="", flush=True) + + response = "" + for new_text in chat_model.stream_chat(messages): + print(new_text, end="", flush=True) + response += new_text + print() + messages.append({"role": "assistant", "content": response}) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/chat/hf_engine.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/chat/hf_engine.py similarity index 66% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/chat/hf_engine.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/chat/hf_engine.py index c634ba1..28e6a40 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/chat/hf_engine.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/chat/hf_engine.py @@ -2,25 +2,31 @@ import asyncio import concurrent.futures import os from threading import Thread -from typing import TYPE_CHECKING, Any, AsyncGenerator, Callable, Dict, List, Optional, Sequence, Tuple +from typing import TYPE_CHECKING, Any, AsyncGenerator, Callable, Dict, List, Optional, Sequence, Tuple, Union import torch from transformers import GenerationConfig, TextIteratorStreamer from ..data import get_template_and_fix_tokenizer +from ..extras.logging import get_logger from ..extras.misc import get_logits_processor -from ..model import load_model_and_tokenizer +from ..model import load_model, load_tokenizer from .base_engine import BaseEngine, Response if TYPE_CHECKING: - from transformers import PreTrainedModel, PreTrainedTokenizer + from numpy.typing import NDArray + from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin + from transformers.image_processing_utils import BaseImageProcessor from trl import PreTrainedModelWrapper from ..data import Template from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments +logger = get_logger(__name__) + + class HuggingfaceEngine(BaseEngine): def __init__( self, @@ -30,55 +36,96 @@ class HuggingfaceEngine(BaseEngine): generating_args: "GeneratingArguments", ) -> None: self.can_generate = finetuning_args.stage == "sft" - self.model, self.tokenizer = load_model_and_tokenizer( - model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate) - ) + tokenizer_module = load_tokenizer(model_args) + self.tokenizer = tokenizer_module["tokenizer"] + self.processor = tokenizer_module["processor"] self.tokenizer.padding_side = "left" if self.can_generate else "right" self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template) + self.model = load_model( + self.tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=(not self.can_generate) + ) # must after fixing tokenizer to resize vocab self.generating_args = generating_args.to_dict() @staticmethod def _process_args( model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], template: "Template", generating_args: Dict[str, Any], messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, + image: Optional["NDArray"] = None, input_kwargs: Optional[Dict[str, Any]] = {}, ) -> Tuple[Dict[str, Any], int]: + if ( + processor is not None + and image is not None + and not hasattr(processor, "image_seq_length") + and template.image_token not in messages[0]["content"] + ): # llava-like models + messages[0]["content"] = template.image_token + messages[0]["content"] + paired_messages = messages + [{"role": "assistant", "content": ""}] + system = system or generating_args["default_system"] + pixel_values = None 
prompt_ids, _ = template.encode_oneturn( tokenizer=tokenizer, messages=paired_messages, system=system, tools=tools ) + if processor is not None and image is not None: # add image features + image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") + batch_feature = image_processor(image, return_tensors="pt") + pixel_values = batch_feature.to(model.device)["pixel_values"] # shape (B, C, H, W) + if hasattr(processor, "image_seq_length"): # paligemma models + image_token_id = tokenizer.convert_tokens_to_ids(template.image_token) + prompt_ids = [image_token_id] * getattr(processor, "image_seq_length") + prompt_ids + prompt_length = len(prompt_ids) inputs = torch.tensor([prompt_ids], device=model.device) + attention_mask = torch.ones_like(inputs, dtype=torch.bool) - do_sample = input_kwargs.pop("do_sample", None) - temperature = input_kwargs.pop("temperature", None) - top_p = input_kwargs.pop("top_p", None) - top_k = input_kwargs.pop("top_k", None) - num_return_sequences = input_kwargs.pop("num_return_sequences", None) - repetition_penalty = input_kwargs.pop("repetition_penalty", None) - max_length = input_kwargs.pop("max_length", None) - max_new_tokens = input_kwargs.pop("max_new_tokens", None) + do_sample: Optional[bool] = input_kwargs.pop("do_sample", None) + temperature: Optional[float] = input_kwargs.pop("temperature", None) + top_p: Optional[float] = input_kwargs.pop("top_p", None) + top_k: Optional[float] = input_kwargs.pop("top_k", None) + num_return_sequences: int = input_kwargs.pop("num_return_sequences", 1) + repetition_penalty: Optional[float] = input_kwargs.pop("repetition_penalty", None) + length_penalty: Optional[float] = input_kwargs.pop("length_penalty", None) + max_length: Optional[int] = input_kwargs.pop("max_length", None) + max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None) + stop: Optional[Union[str, List[str]]] = input_kwargs.pop("stop", None) + if stop is not None: + logger.warning("Stop parameter is not supported in Huggingface engine yet.") + + generating_args = generating_args.copy() generating_args.update( dict( do_sample=do_sample if do_sample is not None else generating_args["do_sample"], - temperature=temperature or generating_args["temperature"], - top_p=top_p or generating_args["top_p"], - top_k=top_k or generating_args["top_k"], - num_return_sequences=num_return_sequences or 1, - repetition_penalty=repetition_penalty or generating_args["repetition_penalty"], + temperature=temperature if temperature is not None else generating_args["temperature"], + top_p=top_p if top_p is not None else generating_args["top_p"], + top_k=top_k if top_k is not None else generating_args["top_k"], + num_return_sequences=num_return_sequences, + repetition_penalty=repetition_penalty + if repetition_penalty is not None + else generating_args["repetition_penalty"], + length_penalty=length_penalty if length_penalty is not None else generating_args["length_penalty"], eos_token_id=[tokenizer.eos_token_id] + tokenizer.additional_special_tokens_ids, pad_token_id=tokenizer.pad_token_id, ) ) - if isinstance(num_return_sequences, int) and num_return_sequences > 1: + if isinstance(num_return_sequences, int) and num_return_sequences > 1: # do_sample needs temperature > 0 generating_args["do_sample"] = True + generating_args["temperature"] = generating_args["temperature"] or 1.0 + + if not generating_args["temperature"]: + generating_args["do_sample"] = False + + if not generating_args["do_sample"]: + generating_args.pop("temperature", None) + 
generating_args.pop("top_p", None) if max_length: generating_args.pop("max_new_tokens", None) @@ -90,10 +137,14 @@ class HuggingfaceEngine(BaseEngine): gen_kwargs = dict( inputs=inputs, + attention_mask=attention_mask, generation_config=GenerationConfig(**generating_args), logits_processor=get_logits_processor(), ) + if pixel_values is not None: + gen_kwargs["pixel_values"] = pixel_values + return gen_kwargs, prompt_length @staticmethod @@ -101,15 +152,17 @@ class HuggingfaceEngine(BaseEngine): def _chat( model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], template: "Template", generating_args: Dict[str, Any], messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, + image: Optional["NDArray"] = None, input_kwargs: Optional[Dict[str, Any]] = {}, ) -> List["Response"]: gen_kwargs, prompt_length = HuggingfaceEngine._process_args( - model, tokenizer, template, generating_args, messages, system, tools, input_kwargs + model, tokenizer, processor, template, generating_args, messages, system, tools, image, input_kwargs ) generate_output = model.generate(**gen_kwargs) response_ids = generate_output[:, prompt_length:] @@ -134,15 +187,17 @@ class HuggingfaceEngine(BaseEngine): def _stream_chat( model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], template: "Template", generating_args: Dict[str, Any], messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, + image: Optional["NDArray"] = None, input_kwargs: Optional[Dict[str, Any]] = {}, ) -> Callable[[], str]: gen_kwargs, _ = HuggingfaceEngine._process_args( - model, tokenizer, template, generating_args, messages, system, tools, input_kwargs + model, tokenizer, processor, template, generating_args, messages, system, tools, image, input_kwargs ) streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) gen_kwargs["streamer"] = streamer @@ -198,6 +253,7 @@ class HuggingfaceEngine(BaseEngine): messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, + image: Optional["NDArray"] = None, **input_kwargs, ) -> List["Response"]: if not self.can_generate: @@ -207,11 +263,13 @@ class HuggingfaceEngine(BaseEngine): input_args = ( self.model, self.tokenizer, + self.processor, self.template, self.generating_args, messages, system, tools, + image, input_kwargs, ) async with self._semaphore: @@ -223,6 +281,7 @@ class HuggingfaceEngine(BaseEngine): messages: Sequence[Dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, + image: Optional["NDArray"] = None, **input_kwargs, ) -> AsyncGenerator[str, None]: if not self.can_generate: @@ -232,11 +291,13 @@ class HuggingfaceEngine(BaseEngine): input_args = ( self.model, self.tokenizer, + self.processor, self.template, self.generating_args, messages, system, tools, + image, input_kwargs, ) async with self._semaphore: diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/chat/vllm_engine.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/chat/vllm_engine.py new file mode 100644 index 0000000..87ce868 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/chat/vllm_engine.py @@ -0,0 +1,214 @@ +import uuid +from typing import TYPE_CHECKING, AsyncGenerator, AsyncIterator, Dict, List, Optional, Sequence, Union + +from ..data import get_template_and_fix_tokenizer +from ..extras.logging import get_logger +from ..extras.misc import get_device_count 
+from ..extras.packages import is_vllm_available +from ..model import load_config, load_tokenizer +from ..model.model_utils.visual import LlavaMultiModalProjectorForYiVLForVLLM +from .base_engine import BaseEngine, Response + + +if is_vllm_available(): + from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams + from vllm.lora.request import LoRARequest + from vllm.sequence import MultiModalData + + +if TYPE_CHECKING: + from numpy.typing import NDArray + from transformers.image_processing_utils import BaseImageProcessor + + from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments + + +logger = get_logger(__name__) + + +class VllmEngine(BaseEngine): + def __init__( + self, + model_args: "ModelArguments", + data_args: "DataArguments", + finetuning_args: "FinetuningArguments", + generating_args: "GeneratingArguments", + ) -> None: + config = load_config(model_args) # may download model from ms hub + + self.can_generate = finetuning_args.stage == "sft" + tokenizer_module = load_tokenizer(model_args) + self.tokenizer = tokenizer_module["tokenizer"] + self.processor = tokenizer_module["processor"] + self.tokenizer.padding_side = "left" + self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template) + self.generating_args = generating_args.to_dict() + + engine_args = { + "model": model_args.model_name_or_path, + "trust_remote_code": True, + "download_dir": model_args.cache_dir, + "dtype": model_args.vllm_dtype, + "max_model_len": model_args.vllm_maxlen, + "tensor_parallel_size": get_device_count() or 1, + "gpu_memory_utilization": model_args.vllm_gpu_util, + "disable_log_stats": True, + "disable_log_requests": True, + "enforce_eager": model_args.vllm_enforce_eager, + "enable_lora": model_args.adapter_name_or_path is not None, + "max_lora_rank": model_args.vllm_max_lora_rank, + } + + if model_args.visual_inputs: + image_size = config.vision_config.image_size + patch_size = config.vision_config.patch_size + self.image_feature_size = (image_size // patch_size) ** 2 + engine_args["image_input_type"] = "pixel_values" + engine_args["image_token_id"] = self.tokenizer.convert_tokens_to_ids(self.template.image_token) + engine_args["image_input_shape"] = "1,3,{},{}".format(image_size, image_size) + engine_args["image_feature_size"] = self.image_feature_size + if getattr(config, "is_yi_vl_derived_model", None): + import vllm.model_executor.models.llava + + logger.info("Detected Yi-VL model, applying projector patch.") + vllm.model_executor.models.llava.LlavaMultiModalProjector = LlavaMultiModalProjectorForYiVLForVLLM + + self.model = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**engine_args)) + if model_args.adapter_name_or_path is not None: + self.lora_request = LoRARequest("default", 1, model_args.adapter_name_or_path[0]) + else: + self.lora_request = None + + async def _generate( + self, + messages: Sequence[Dict[str, str]], + system: Optional[str] = None, + tools: Optional[str] = None, + image: Optional["NDArray"] = None, + **input_kwargs, + ) -> AsyncIterator["RequestOutput"]: + request_id = "chatcmpl-{}".format(uuid.uuid4().hex) + + if ( + self.processor is not None + and image is not None + and not hasattr(self.processor, "image_seq_length") + and self.template.image_token not in messages[0]["content"] + ): # llava-like models (TODO: paligemma models) + messages[0]["content"] = self.template.image_token * self.image_feature_size + messages[0]["content"] + + paired_messages = messages + [{"role": "assistant", 
"content": ""}] + system = system or self.generating_args["default_system"] + prompt_ids, _ = self.template.encode_oneturn( + tokenizer=self.tokenizer, messages=paired_messages, system=system, tools=tools + ) + + if self.processor is not None and image is not None: # add image features + image_processor: "BaseImageProcessor" = getattr(self.processor, "image_processor") + pixel_values = image_processor(image, return_tensors="pt")["pixel_values"] + multi_modal_data = MultiModalData(type=MultiModalData.Type.IMAGE, data=pixel_values) + else: + multi_modal_data = None + + prompt_length = len(prompt_ids) + + use_beam_search: bool = self.generating_args["num_beams"] > 1 + temperature: Optional[float] = input_kwargs.pop("temperature", None) + top_p: Optional[float] = input_kwargs.pop("top_p", None) + top_k: Optional[float] = input_kwargs.pop("top_k", None) + num_return_sequences: int = input_kwargs.pop("num_return_sequences", 1) + repetition_penalty: Optional[float] = input_kwargs.pop("repetition_penalty", None) + length_penalty: Optional[float] = input_kwargs.pop("length_penalty", None) + max_length: Optional[int] = input_kwargs.pop("max_length", None) + max_new_tokens: Optional[int] = input_kwargs.pop("max_new_tokens", None) + stop: Optional[Union[str, List[str]]] = input_kwargs.pop("stop", None) + + if "max_new_tokens" in self.generating_args: + max_tokens = self.generating_args["max_new_tokens"] + elif "max_length" in self.generating_args: + if self.generating_args["max_length"] > prompt_length: + max_tokens = self.generating_args["max_length"] - prompt_length + else: + max_tokens = 1 + + if max_length: + max_tokens = max_length - prompt_length if max_length > prompt_length else 1 + + if max_new_tokens: + max_tokens = max_new_tokens + + sampling_params = SamplingParams( + n=num_return_sequences, + repetition_penalty=( + repetition_penalty if repetition_penalty is not None else self.generating_args["repetition_penalty"] + ) + or 1.0, # repetition_penalty must > 0 + temperature=temperature if temperature is not None else self.generating_args["temperature"], + top_p=(top_p if top_p is not None else self.generating_args["top_p"]) or 1.0, # top_p must > 0 + top_k=top_k if top_k is not None else self.generating_args["top_k"], + use_beam_search=use_beam_search, + length_penalty=length_penalty if length_penalty is not None else self.generating_args["length_penalty"], + stop=stop, + stop_token_ids=[self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids, + max_tokens=max_tokens, + skip_special_tokens=True, + ) + + result_generator = self.model.generate( + inputs={"prompt_token_ids": prompt_ids, "multi_modal_data": multi_modal_data}, + sampling_params=sampling_params, + request_id=request_id, + lora_request=self.lora_request, + ) + return result_generator + + async def start(self) -> None: + pass + + async def chat( + self, + messages: Sequence[Dict[str, str]], + system: Optional[str] = None, + tools: Optional[str] = None, + image: Optional["NDArray"] = None, + **input_kwargs, + ) -> List["Response"]: + final_output = None + generator = await self._generate(messages, system, tools, image, **input_kwargs) + async for request_output in generator: + final_output = request_output + + results = [] + for output in final_output.outputs: + results.append( + Response( + response_text=output.text, + response_length=len(output.token_ids), + prompt_length=len(final_output.prompt_token_ids), + finish_reason=output.finish_reason, + ) + ) + + return results + + async def stream_chat( + self, + 
messages: Sequence[Dict[str, str]], + system: Optional[str] = None, + tools: Optional[str] = None, + image: Optional["NDArray"] = None, + **input_kwargs, + ) -> AsyncGenerator[str, None]: + generated_text = "" + generator = await self._generate(messages, system, tools, image, **input_kwargs) + async for result in generator: + delta_text = result.outputs[0].text[len(generated_text) :] + generated_text = result.outputs[0].text + yield delta_text + + async def get_scores( + self, + batch_input: List[str], + **input_kwargs, + ) -> List[float]: + raise NotImplementedError("vLLM engine does not support get_scores.") diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/cli.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/cli.py new file mode 100644 index 0000000..5042e53 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/cli.py @@ -0,0 +1,106 @@ +import os +import random +import subprocess +import sys +from enum import Enum, unique + +from . import launcher +from .api.app import run_api +from .chat.chat_model import run_chat +from .eval.evaluator import run_eval +from .extras.env import VERSION, print_env +from .extras.logging import get_logger +from .extras.misc import get_device_count +from .train.tuner import export_model, run_exp +from .webui.interface import run_web_demo, run_web_ui + + +USAGE = ( + "-" * 70 + + "\n" + + "| Usage: |\n" + + "| llamafactory-cli api -h: launch an OpenAI-style API server |\n" + + "| llamafactory-cli chat -h: launch a chat interface in CLI |\n" + + "| llamafactory-cli eval -h: evaluate models |\n" + + "| llamafactory-cli export -h: merge LoRA adapters and export model |\n" + + "| llamafactory-cli train -h: train models |\n" + + "| llamafactory-cli webchat -h: launch a chat interface in Web UI |\n" + + "| llamafactory-cli webui: launch LlamaBoard |\n" + + "| llamafactory-cli version: show version info |\n" + + "-" * 70 +) + +WELCOME = ( + "-" * 58 + + "\n" + + "| Welcome to LLaMA Factory, version {}".format(VERSION) + + " " * (21 - len(VERSION)) + + "|\n|" + + " " * 56 + + "|\n" + + "| Project page: https://github.com/hiyouga/LLaMA-Factory |\n" + + "-" * 58 +) + +logger = get_logger(__name__) + + +@unique +class Command(str, Enum): + API = "api" + CHAT = "chat" + ENV = "env" + EVAL = "eval" + EXPORT = "export" + TRAIN = "train" + WEBDEMO = "webchat" + WEBUI = "webui" + VER = "version" + HELP = "help" + + +def main(): + command = sys.argv.pop(1) + if command == Command.API: + run_api() + elif command == Command.CHAT: + run_chat() + elif command == Command.ENV: + print_env() + elif command == Command.EVAL: + run_eval() + elif command == Command.EXPORT: + export_model() + elif command == Command.TRAIN: + force_torchrun = os.environ.get("FORCE_TORCHRUN", "0").lower() in ["true", "1"] + if force_torchrun or get_device_count() > 1: + master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1") + master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999))) + logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port)) + subprocess.run( + ( + "torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} " + "--master_addr {master_addr} --master_port {master_port} {file_name} {args}" + ).format( + nnodes=os.environ.get("NNODES", "1"), + node_rank=os.environ.get("RANK", "0"), + nproc_per_node=os.environ.get("NPROC_PER_NODE", str(get_device_count())), + master_addr=master_addr, + master_port=master_port, + file_name=launcher.__file__, + args=" ".join(sys.argv[1:]), + ), + 
shell=True, + ) + else: + run_exp() + elif command == Command.WEBDEMO: + run_web_demo() + elif command == Command.WEBUI: + run_web_ui() + elif command == Command.VER: + print(WELCOME) + elif command == Command.HELP: + print(USAGE) + else: + raise NotImplementedError("Unknown command: {}".format(command)) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/__init__.py new file mode 100644 index 0000000..b08691d --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/__init__.py @@ -0,0 +1,16 @@ +from .collator import KTODataCollatorWithPadding, PairwiseDataCollatorWithPadding +from .data_utils import Role, split_dataset +from .loader import get_dataset +from .template import TEMPLATES, Template, get_template_and_fix_tokenizer + + +__all__ = [ + "KTODataCollatorWithPadding", + "PairwiseDataCollatorWithPadding", + "Role", + "split_dataset", + "get_dataset", + "TEMPLATES", + "Template", + "get_template_and_fix_tokenizer", +] diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/aligner.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/aligner.py new file mode 100644 index 0000000..434956a --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/aligner.py @@ -0,0 +1,221 @@ +import os +from functools import partial +from typing import TYPE_CHECKING, Any, Dict, List, Union + +from datasets import Features + +from ..extras.logging import get_logger +from .data_utils import Role + + +if TYPE_CHECKING: + from datasets import Dataset, IterableDataset + + from ..hparams import DataArguments + from .parser import DatasetAttr + + +logger = get_logger(__name__) + + +def _convert_images(images: List[Any], dataset_attr: "DatasetAttr", data_args: "DataArguments") -> List[Any]: + r""" + Optionally concatenates image path to dataset dir when loading from local disk. + """ + outputs = [] + if dataset_attr.load_from in ["script", "file"]: + for image in images: + if isinstance(image, str) and os.path.isfile(os.path.join(data_args.dataset_dir, image)): + outputs.append(os.path.join(data_args.dataset_dir, image)) + else: + outputs.append(image) + + return outputs + + +def convert_alpaca( + examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments" +) -> Dict[str, List[Any]]: + r""" + Converts alpaca format dataset to the standard format. 
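+ The output has "prompt", "response", "system", "tools" and "images" columns.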
+ """ + outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []} + convert_images = partial(_convert_images, dataset_attr=dataset_attr, data_args=data_args) + for i in range(len(examples[dataset_attr.prompt])): + prompt = [] + if dataset_attr.history and isinstance(examples[dataset_attr.history][i], list): + for old_prompt, old_response in examples[dataset_attr.history][i]: + prompt.append({"role": Role.USER.value, "content": old_prompt}) + prompt.append({"role": Role.ASSISTANT.value, "content": old_response}) + + content = [] + if dataset_attr.prompt and examples[dataset_attr.prompt][i]: + content.append(examples[dataset_attr.prompt][i]) + + if dataset_attr.query and examples[dataset_attr.query][i]: + content.append(examples[dataset_attr.query][i]) + + prompt.append({"role": Role.USER.value, "content": "\n".join(content)}) # "prompt\nquery" + + if dataset_attr.kto_tag and isinstance(examples[dataset_attr.kto_tag][i], bool): # kto example + response = [{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.response][i]}] + if examples[dataset_attr.kto_tag][i]: + response = response + [{"role": Role.ASSISTANT.value, "content": ""}] + else: + response = [{"role": Role.ASSISTANT.value, "content": ""}] + response + elif ( + dataset_attr.ranking + and isinstance(examples[dataset_attr.chosen][i], str) + and isinstance(examples[dataset_attr.rejected][i], str) + ): # pairwise example + response = [ + {"role": Role.ASSISTANT.value, "content": examples[dataset_attr.chosen][i]}, + {"role": Role.ASSISTANT.value, "content": examples[dataset_attr.rejected][i]}, + ] + elif dataset_attr.response and isinstance(examples[dataset_attr.response][i], str): # normal example + response = [{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.response][i]}] + else: # unsupervised + response = [] + + outputs["prompt"].append(prompt) + outputs["response"].append(response) + outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "") + outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") + outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else []) + + return outputs + + +def convert_sharegpt( + examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments" +) -> Dict[str, List[Any]]: + r""" + Converts sharegpt format dataset to the standard format. 
+ """ + outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []} + convert_images = partial(_convert_images, dataset_attr=dataset_attr, data_args=data_args) + tag_mapping = { + dataset_attr.user_tag: Role.USER.value, + dataset_attr.assistant_tag: Role.ASSISTANT.value, + dataset_attr.observation_tag: Role.OBSERVATION.value, + dataset_attr.function_tag: Role.FUNCTION.value, + dataset_attr.system_tag: Role.SYSTEM.value, + } + odd_tags = (dataset_attr.user_tag, dataset_attr.observation_tag) + even_tags = (dataset_attr.assistant_tag, dataset_attr.function_tag) + accept_tags = (odd_tags, even_tags) + for i, messages in enumerate(examples[dataset_attr.messages]): + if dataset_attr.system_tag and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag: + system = messages[0][dataset_attr.content_tag] + messages = messages[1:] + else: + system = examples[dataset_attr.system][i] if dataset_attr.system else "" + + if len(messages) == 0: + continue + + aligned_messages = [] + broken_data = False + for turn_idx, message in enumerate(messages): + if message[dataset_attr.role_tag] not in accept_tags[turn_idx % 2]: + logger.warning("Invalid role tag in {}.".format(messages)) + broken_data = True + + aligned_messages.append( + {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]} + ) + + if (not dataset_attr.ranking and len(aligned_messages) % 2 != 0) or ( + dataset_attr.ranking and len(aligned_messages) % 2 == 0 + ): + logger.warning("Invalid message count in {}.".format(messages)) + broken_data = True + + if dataset_attr.kto_tag and isinstance(examples[dataset_attr.kto_tag][i], bool): # kto example + prompt = aligned_messages[:-1] + response = aligned_messages[-1:] + if examples[dataset_attr.kto_tag][i]: + response = response + [{"role": Role.ASSISTANT.value, "content": ""}] + else: + response = [{"role": Role.ASSISTANT.value, "content": ""}] + response + elif ( + dataset_attr.ranking + and isinstance(examples[dataset_attr.chosen][i], dict) + and isinstance(examples[dataset_attr.rejected][i], dict) + ): # pairwise example + chosen = examples[dataset_attr.chosen][i] + rejected = examples[dataset_attr.rejected][i] + if ( + chosen[dataset_attr.role_tag] not in accept_tags[-1] + or rejected[dataset_attr.role_tag] not in accept_tags[-1] + ): + logger.warning("Invalid role tag in {}.".format([chosen, rejected])) + broken_data = True + + prompt = aligned_messages + response = [ + {"role": tag_mapping[chosen[dataset_attr.role_tag]], "content": chosen[dataset_attr.content_tag]}, + {"role": tag_mapping[rejected[dataset_attr.role_tag]], "content": rejected[dataset_attr.content_tag]}, + ] + else: # normal example + prompt = aligned_messages[:-1] + response = aligned_messages[-1:] + + if broken_data: + logger.warning("Skipping this abnormal example.") + continue + + outputs["prompt"].append(prompt) + outputs["response"].append(response) + outputs["system"].append(system) + outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") + outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else []) + + return outputs + + +def align_dataset( + dataset: Union["Dataset", "IterableDataset"], dataset_attr: "DatasetAttr", data_args: "DataArguments" +) -> Union["Dataset", "IterableDataset"]: + r""" + Aligned dataset: + prompt: [{"role": "user", "content": "..."}] * (2T - 1) + response: [{"role": "assistant", "content": "..."}] * N (N > 1 for ranking dataset) + system: "..." 
+ tools: "...", + images: [], + """ + if dataset_attr.formatting == "alpaca": + convert_func = partial(convert_alpaca, dataset_attr=dataset_attr, data_args=data_args) + else: + convert_func = partial(convert_sharegpt, dataset_attr=dataset_attr, data_args=data_args) + + column_names = list(next(iter(dataset)).keys()) + features = Features.from_dict( + { + "prompt": [ + {"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}} + ], + "response": [ + {"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}} + ], + "system": {"dtype": "string", "_type": "Value"}, + "tools": {"dtype": "string", "_type": "Value"}, + "images": [{"_type": "Image"}], + } + ) + kwargs = {} + if not data_args.streaming: + kwargs = dict( + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=(not data_args.overwrite_cache), + desc="Converting format of dataset", + ) + + return dataset.map( + convert_func, + batched=True, + remove_columns=column_names, + features=features, + **kwargs, + ) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/collator.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/collator.py new file mode 100644 index 0000000..1dc8dd8 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/collator.py @@ -0,0 +1,81 @@ +from dataclasses import dataclass +from typing import Any, Dict, Sequence + +import torch +from transformers import DataCollatorForSeq2Seq + + +@dataclass +class PairwiseDataCollatorWithPadding(DataCollatorForSeq2Seq): + r""" + Data collator for pairwise data. + """ + + def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]: + r""" + Pads batched data to the longest sequence in the batch. + + We generate 2 * n examples where the first n examples represent chosen examples and + the last n examples represent rejected examples. + """ + concatenated_features = [] + for key in ("chosen", "rejected"): + for feature in features: + target_feature = { + "input_ids": feature["{}_input_ids".format(key)], + "attention_mask": feature["{}_attention_mask".format(key)], + "labels": feature["{}_labels".format(key)], + } + if "pixel_values" in feature: + target_feature["pixel_values"] = feature["pixel_values"] + + if "{}_token_type_ids".format(key) in feature: + target_feature["token_type_ids"] = feature["{}_token_type_ids".format(key)] + + concatenated_features.append(target_feature) + + return super().__call__(concatenated_features) + + +@dataclass +class KTODataCollatorWithPadding(DataCollatorForSeq2Seq): + r""" + Data collator for KTO data. 
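+ It pads the target sequences and the KL (mismatched) sequences separately and returns them in a single batch together with the kto_tags.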
+ """ + + def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]: + target_features = [] + kl_features = [] + kto_tags = [] + for feature in features: + target_feature = { + "input_ids": feature["input_ids"], + "attention_mask": feature["attention_mask"], + "labels": feature["labels"], + } + kl_feature = { + "input_ids": feature["kl_input_ids"], + "attention_mask": feature["kl_attention_mask"], + "labels": feature["kl_labels"], + } + if "pixel_values" in feature: + target_feature["pixel_values"] = feature["pixel_values"] + + if "token_type_ids" in feature: + target_feature["token_type_ids"] = feature["token_type_ids"] + kl_feature["token_type_ids"] = feature["kl_token_type_ids"] + + target_features.append(target_feature) + kl_features.append(kl_feature) + kto_tags.append(feature["kto_tags"]) + + batch = super().__call__(target_features) + kl_batch = super().__call__(kl_features) + batch["kl_input_ids"] = kl_batch["input_ids"] + batch["kl_attention_mask"] = kl_batch["attention_mask"] + batch["kl_labels"] = kl_batch["labels"] + if "token_type_ids" in batch: + batch["kl_token_type_ids"] = kl_batch["token_type_ids"] + + batch["kto_tags"] = torch.tensor(kto_tags) + return batch diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/utils.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/data_utils.py similarity index 80% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/data/utils.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/data/data_utils.py index c0b6d6c..9b31311 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/utils.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/data_utils.py @@ -1,6 +1,5 @@ -import hashlib from enum import Enum, unique -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Tuple, Union from datasets import concatenate_datasets, interleave_datasets @@ -11,7 +10,7 @@ if TYPE_CHECKING: from datasets import Dataset, IterableDataset from transformers import Seq2SeqTrainingArguments - from llmtuner.hparams import DataArguments + from ..hparams import DataArguments logger = get_logger(__name__) @@ -26,25 +25,10 @@ class Role(str, Enum): OBSERVATION = "observation" -def checksum(data_files: List[str], file_sha1: Optional[str] = None) -> None: - if file_sha1 is None: - logger.warning("Checksum failed: missing SHA-1 hash value in dataset_info.json.") - return - - if len(data_files) != 1: - logger.warning("Checksum failed: too many files.") - return - - with open(data_files[0], "rb") as f: - sha1 = hashlib.sha1(f.read()).hexdigest() - if sha1 != file_sha1: - logger.warning("Checksum failed: mismatched SHA-1 hash value at {}.".format(data_files[0])) - - def infer_max_len(source_len: int, target_len: int, max_len: int, reserved_label_len: int) -> Tuple[int, int]: max_target_len = int(max_len * (target_len / (source_len + target_len))) max_target_len = max(max_target_len, reserved_label_len) - max_source_len = max_len - max_target_len + max_source_len = max_len - min(max_target_len, target_len) return max_source_len, max_target_len @@ -78,9 +62,9 @@ def split_dataset( if training_args.do_train: if data_args.val_size > 1e-6: # Split the dataset if data_args.streaming: + dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed) val_set = dataset.take(int(data_args.val_size)) train_set = dataset.skip(int(data_args.val_size)) - dataset = dataset.shuffle(buffer_size=data_args.buffer_size, 
seed=training_args.seed) return {"train_dataset": train_set, "eval_dataset": val_set} else: val_size = int(data_args.val_size) if data_args.val_size > 1 else data_args.val_size diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/formatter.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/formatter.py similarity index 100% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/data/formatter.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/data/formatter.py diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/loader.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/loader.py similarity index 68% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/data/loader.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/data/loader.py index 935695a..2c236c7 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/loader.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/loader.py @@ -1,21 +1,24 @@ import inspect import os -from typing import TYPE_CHECKING, Literal, Union +import sys +from typing import TYPE_CHECKING, Literal, Optional, Union +import numpy as np from datasets import load_dataset, load_from_disk from ..extras.constants import FILEEXT2TYPE from ..extras.logging import get_logger +from ..extras.misc import has_tokenized_data from .aligner import align_dataset +from .data_utils import merge_dataset from .parser import get_dataset_list from .preprocess import get_preprocess_and_print_func from .template import get_template_and_fix_tokenizer -from .utils import checksum, merge_dataset if TYPE_CHECKING: from datasets import Dataset, IterableDataset - from transformers import Seq2SeqTrainingArguments + from transformers import ProcessorMixin, Seq2SeqTrainingArguments from transformers.tokenization_utils import PreTrainedTokenizer from ..hparams import DataArguments, ModelArguments @@ -56,14 +59,12 @@ def load_single_dataset( data_files.append(local_path) data_path = FILEEXT2TYPE.get(local_path.split(".")[-1], None) else: - raise ValueError("File not found.") + raise ValueError("File {} not found.".format(local_path)) if data_path is None: - raise ValueError("File extension must be txt, csv, json or jsonl.") - - checksum(data_files, dataset_attr.file_sha1) + raise ValueError("Allowed file types: {}.".format(",".join(FILEEXT2TYPE.keys()))) else: - raise NotImplementedError + raise NotImplementedError("Unknown load type: {}.".format(dataset_attr.load_from)) if dataset_attr.load_from == "ms_hub": try: @@ -80,7 +81,9 @@ def load_single_dataset( cache_dir=cache_dir, token=model_args.ms_hub_token, use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")), - ).to_hf_dataset() + ) + if isinstance(dataset, MsDataset): + dataset = dataset.to_hf_dataset() except ImportError: raise ImportError("Please install modelscope via `pip install modelscope -U`") else: @@ -104,30 +107,43 @@ def load_single_dataset( if data_args.streaming and (dataset_attr.load_from == "file"): # faster than specifying streaming=True dataset = dataset.to_iterable_dataset() # TODO: add num shards parameter + if dataset_attr.num_samples is not None and not data_args.streaming: + target_num = dataset_attr.num_samples + indexes = np.random.permutation(len(dataset))[:target_num] + target_num -= len(indexes) + if target_num > 0: + expand_indexes = np.random.choice(len(dataset), target_num) + indexes = np.concatenate((indexes, expand_indexes), axis=0) + + assert len(indexes) == dataset_attr.num_samples, "Sample num mismatched." 
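# [editor's aside — illustrative sketch, not part of the patch] the num_samples branch above
# first draws a random permutation of the dataset and, only when more samples are requested
# than rows exist, tops the selection up by sampling indexes with replacement.
# `pick_indexes` is a hypothetical helper name used only for illustration.
import numpy as np

def pick_indexes(dataset_len: int, num_samples: int) -> np.ndarray:
    indexes = np.random.permutation(dataset_len)[:num_samples]  # unique rows first
    missing = num_samples - len(indexes)
    if missing > 0:  # oversample with replacement to reach the requested count
        indexes = np.concatenate((indexes, np.random.choice(dataset_len, missing)))
    return indexes

# e.g. pick_indexes(3, 5) returns the 3 available indexes plus 2 more drawn with replacement.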
+ dataset = dataset.select(indexes) + logger.info("Sampled {} examples from dataset {}.".format(dataset_attr.num_samples, dataset_attr)) + if data_args.max_samples is not None: # truncate dataset - num_samples = min(data_args.max_samples, len(dataset)) - dataset = dataset.select(range(num_samples)) + max_samples = min(data_args.max_samples, len(dataset)) + dataset = dataset.select(range(max_samples)) return align_dataset(dataset, dataset_attr, data_args) def get_dataset( - tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments", data_args: "DataArguments", training_args: "Seq2SeqTrainingArguments", - stage: Literal["pt", "sft", "rm", "ppo"], - # split: Optional[str] = "train", # TODO: add split + stage: Literal["pt", "sft", "rm", "ppo", "kto"], + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"] = None, ) -> Union["Dataset", "IterableDataset"]: template = get_template_and_fix_tokenizer(tokenizer, data_args.template) if data_args.train_on_prompt and template.efficient_eos: raise ValueError("Current template does not support `train_on_prompt`.") - # Load from cache - if data_args.cache_path is not None: - if os.path.exists(data_args.cache_path): + # Load tokenized dataset + if data_args.tokenized_path is not None: + if has_tokenized_data(data_args.tokenized_path): logger.warning("Loading dataset from disk will ignore other data arguments.") - dataset = load_from_disk(data_args.cache_path) + dataset = load_from_disk(data_args.tokenized_path) + logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path)) if data_args.streaming: dataset = dataset.to_iterable_dataset() return dataset @@ -138,12 +154,15 @@ def get_dataset( with training_args.main_process_first(desc="load dataset"): all_datasets = [] for dataset_attr in get_dataset_list(data_args): + if (stage == "rm" and dataset_attr.ranking is False) or (stage != "rm" and dataset_attr.ranking is True): + raise ValueError("The dataset is not applicable in the current training stage.") + all_datasets.append(load_single_dataset(dataset_attr, model_args, data_args)) dataset = merge_dataset(all_datasets, data_args, training_args) with training_args.main_process_first(desc="pre-process dataset"): preprocess_func, print_function = get_preprocess_and_print_func( - tokenizer, template, data_args, training_args, stage + data_args, training_args, stage, template, tokenizer, processor ) column_names = list(next(iter(dataset)).keys()) kwargs = {} @@ -156,15 +175,21 @@ def get_dataset( dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs) - if data_args.cache_path is not None and not os.path.exists(data_args.cache_path): + if data_args.tokenized_path is not None: if training_args.should_save: - dataset.save_to_disk(data_args.cache_path) - logger.info("Dataset cache saved at {}.".format(data_args.cache_path)) + dataset.save_to_disk(data_args.tokenized_path) + logger.info("Tokenized dataset saved at {}.".format(data_args.tokenized_path)) + logger.info("Please restart the training with `tokenized_path: {}`.".format(data_args.tokenized_path)) + + sys.exit(0) if training_args.should_log: try: print_function(next(iter(dataset))) except StopIteration: - raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.") + if stage == "pt": + raise RuntimeError("Cannot find sufficient samples, consider increasing dataset size.") + else: + raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.") return dataset diff 
--git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/parser.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/parser.py similarity index 72% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/data/parser.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/data/parser.py index 861396a..ec97bfc 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/parser.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/parser.py @@ -20,23 +20,28 @@ class DatasetAttr: """ basic configs """ load_from: Literal["hf_hub", "ms_hub", "script", "file"] dataset_name: str + formatting: Literal["alpaca", "sharegpt"] = "alpaca" + ranking: bool = False """ extra configs """ - file_sha1: Optional[str] = None subset: Optional[str] = None folder: Optional[str] = None - ranking: bool = False - formatting: Literal["alpaca", "sharegpt"] = "alpaca" - """ columns """ + num_samples: Optional[int] = None + """ common columns """ system: Optional[str] = None - """ columns for the alpaca format """ + tools: Optional[str] = None + images: Optional[str] = None + """ rlhf columns """ + chosen: Optional[str] = None + rejected: Optional[str] = None + kto_tag: Optional[str] = None + """ alpaca columns """ prompt: Optional[str] = "instruction" query: Optional[str] = "input" response: Optional[str] = "output" history: Optional[str] = None - """ columns for the sharegpt format """ + """ sharegpt columns """ messages: Optional[str] = "conversations" - tools: Optional[str] = None - """ tags for the sharegpt format """ + """ sharegpt tags """ role_tag: Optional[str] = "from" content_tag: Optional[str] = "value" user_tag: Optional[str] = "human" @@ -53,22 +58,35 @@ class DatasetAttr: def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]: - dataset_names = [ds.strip() for ds in data_args.dataset.split(",")] if data_args.dataset is not None else [] - try: - with open(os.path.join(data_args.dataset_dir, DATA_CONFIG), "r") as f: - dataset_info = json.load(f) - except Exception as err: - if data_args.dataset is not None: - raise ValueError( - "Cannot open {} due to {}.".format(os.path.join(data_args.dataset_dir, DATA_CONFIG), str(err)) - ) + if data_args.dataset is not None: + dataset_names = [ds.strip() for ds in data_args.dataset.split(",")] + else: + dataset_names = [] + + if data_args.dataset_dir == "ONLINE": dataset_info = None + else: + try: + with open(os.path.join(data_args.dataset_dir, DATA_CONFIG), "r") as f: + dataset_info = json.load(f) + except Exception as err: + if len(dataset_names) != 0: + raise ValueError( + "Cannot open {} due to {}.".format(os.path.join(data_args.dataset_dir, DATA_CONFIG), str(err)) + ) + dataset_info = None if data_args.interleave_probs is not None: data_args.interleave_probs = [float(prob.strip()) for prob in data_args.interleave_probs.split(",")] dataset_list: List[DatasetAttr] = [] for name in dataset_names: + if dataset_info is None: + load_from = "ms_hub" if use_modelscope() else "hf_hub" + dataset_attr = DatasetAttr(load_from, dataset_name=name) + dataset_list.append(dataset_attr) + continue + if name not in dataset_info: raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG)) @@ -85,18 +103,18 @@ def get_dataset_list(data_args: "DataArguments") -> List["DatasetAttr"]: else: dataset_attr = DatasetAttr("file", dataset_name=dataset_info[name]["file_name"]) - dataset_attr.set_attr("file_sha1", dataset_info[name]) + dataset_attr.set_attr("formatting", dataset_info[name], default="alpaca") + dataset_attr.set_attr("ranking", 
dataset_info[name], default=False) dataset_attr.set_attr("subset", dataset_info[name]) dataset_attr.set_attr("folder", dataset_info[name]) - dataset_attr.set_attr("ranking", dataset_info[name], default=False) - dataset_attr.set_attr("formatting", dataset_info[name], default="alpaca") + dataset_attr.set_attr("num_samples", dataset_info[name]) if "columns" in dataset_info[name]: - column_names = ["system"] + column_names = ["system", "tools", "images", "chosen", "rejected", "kto_tag"] if dataset_attr.formatting == "alpaca": column_names.extend(["prompt", "query", "response", "history"]) else: - column_names.extend(["messages", "tools"]) + column_names.extend(["messages"]) for column_name in column_names: dataset_attr.set_attr(column_name, dataset_info[name]["columns"]) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/preprocess.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/preprocess.py new file mode 100644 index 0000000..97789c3 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/preprocess.py @@ -0,0 +1,84 @@ +from functools import partial +from typing import TYPE_CHECKING, Callable, Literal, Optional, Tuple + +from .processors.feedback import preprocess_feedback_dataset +from .processors.pairwise import preprocess_pairwise_dataset, print_pairwise_dataset_example +from .processors.pretrain import preprocess_pretrain_dataset +from .processors.supervised import ( + preprocess_packed_supervised_dataset, + preprocess_supervised_dataset, + print_supervised_dataset_example, +) +from .processors.unsupervised import preprocess_unsupervised_dataset, print_unsupervised_dataset_example + + +if TYPE_CHECKING: + from transformers import ProcessorMixin, Seq2SeqTrainingArguments + from transformers.tokenization_utils import PreTrainedTokenizer + + from ..hparams import DataArguments + from .template import Template + + +def get_preprocess_and_print_func( + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + stage: Literal["pt", "sft", "rm", "ppo", "kto"], + template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], +) -> Tuple[Callable, Callable]: + if stage == "pt": + preprocess_func = partial( + preprocess_pretrain_dataset, + tokenizer=tokenizer, + data_args=data_args, + ) + print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer) + elif stage == "sft" and not training_args.predict_with_generate: + if data_args.packing: + preprocess_func = partial( + preprocess_packed_supervised_dataset, + template=template, + tokenizer=tokenizer, + data_args=data_args, + ) + else: + preprocess_func = partial( + preprocess_supervised_dataset, + template=template, + tokenizer=tokenizer, + processor=processor, + data_args=data_args, + ) + + print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer) + elif stage == "rm": + preprocess_func = partial( + preprocess_pairwise_dataset, + template=template, + tokenizer=tokenizer, + processor=processor, + data_args=data_args, + ) + print_function = partial(print_pairwise_dataset_example, tokenizer=tokenizer) + elif stage == "kto": + preprocess_func = partial( + preprocess_feedback_dataset, + template=template, + tokenizer=tokenizer, + processor=processor, + data_args=data_args, + ) + print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer) + else: + preprocess_func = partial( + preprocess_unsupervised_dataset, + template=template, + tokenizer=tokenizer, + processor=processor, + data_args=data_args, 
+ ) + print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer) + + return preprocess_func, print_function diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/patches/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/__init__.py similarity index 100% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/patches/__init__.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/__init__.py diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/feedback.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/feedback.py new file mode 100644 index 0000000..98d8365 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/feedback.py @@ -0,0 +1,126 @@ +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple + +from ...extras.constants import IGNORE_INDEX +from ...extras.logging import get_logger +from .processor_utils import get_paligemma_token_type_ids, get_pixel_values + + +if TYPE_CHECKING: + from transformers import ProcessorMixin + from transformers.tokenization_utils import PreTrainedTokenizer + + from ...hparams import DataArguments + from ..template import Template + + +logger = get_logger(__name__) + + +def _encode_feedback_example( + prompt: Sequence[Dict[str, str]], + response: Sequence[Dict[str, str]], + kl_response: Sequence[Dict[str, str]], + system: Optional[str], + tools: Optional[str], + template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], + data_args: "DataArguments", +) -> Tuple[List[int], List[int], List[int], List[int], bool]: + if processor is not None and not hasattr(processor, "image_seq_length"): # llava-like models + prompt[0]["content"] = template.image_token + prompt[0]["content"] + + if response[0]["content"]: # desired example + kto_tag = True + messages = prompt + [response[0]] + else: # undesired example + kto_tag = False + messages = prompt + [response[1]] + + if kl_response[0]["content"]: + kl_messages = prompt + [kl_response[0]] + else: + kl_messages = prompt + [kl_response[1]] + + prompt_ids, response_ids = template.encode_oneturn( + tokenizer, messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len + ) + _, kl_response_ids = template.encode_oneturn( + tokenizer, kl_messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len + ) + + if template.efficient_eos: + response_ids += [tokenizer.eos_token_id] + kl_response_ids += [tokenizer.eos_token_id] + + if processor is not None and hasattr(processor, "image_seq_length"): # paligemma models + image_token_id = tokenizer.convert_tokens_to_ids(template.image_token) + prompt_ids = [image_token_id] * getattr(processor, "image_seq_length") + prompt_ids + + input_ids = prompt_ids + response_ids + labels = [IGNORE_INDEX] * len(prompt_ids) + response_ids + kl_input_ids = prompt_ids + kl_response_ids + kl_labels = [IGNORE_INDEX] * len(prompt_ids) + kl_response_ids + + return input_ids, labels, kl_input_ids, kl_labels, kto_tag + + +def preprocess_feedback_dataset( + examples: Dict[str, List[Any]], + template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], + data_args: "DataArguments", +) -> Dict[str, List[List[int]]]: + # create unrelated input-output pairs for estimating the KL term by flipping the matched pairs + kl_response = examples["response"][::-1] + model_inputs = { + "input_ids": [], + "attention_mask": [], + 
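+ # the kl_* entries below hold the mismatched (flipped) prompt-response encodings used to estimate the KL reference term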
"labels": [], + "kl_input_ids": [], + "kl_attention_mask": [], + "kl_labels": [], + "kto_tags": [], + } + if processor is not None: + model_inputs["pixel_values"] = [] + if hasattr(processor, "image_seq_length"): # paligemma models + model_inputs["token_type_ids"] = [] + model_inputs["kl_token_type_ids"] = [] + + for i in range(len(examples["prompt"])): + if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) < 2: + logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i])) + continue + + input_ids, labels, kl_input_ids, kl_labels, kto_tag = _encode_feedback_example( + prompt=examples["prompt"][i], + response=examples["response"][i], + kl_response=kl_response[i], + system=examples["system"][i], + tools=examples["tools"][i], + template=template, + tokenizer=tokenizer, + processor=processor, + data_args=data_args, + ) + model_inputs["input_ids"].append(input_ids) + model_inputs["attention_mask"].append([1] * len(input_ids)) + model_inputs["labels"].append(labels) + model_inputs["kl_input_ids"].append(kl_input_ids) + model_inputs["kl_attention_mask"].append([1] * len(kl_input_ids)) + model_inputs["kl_labels"].append(kl_labels) + model_inputs["kto_tags"].append(kto_tag) + if processor is not None: + model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor)) + if hasattr(processor, "image_seq_length"): # paligemma models + model_inputs["token_type_ids"].append(get_paligemma_token_type_ids(len(input_ids), processor)) + model_inputs["kl_token_type_ids"].append(get_paligemma_token_type_ids(len(kl_input_ids), processor)) + + desirable_num = sum([1 for tag in model_inputs["kto_tags"] if tag]) + undesirable_num = len(model_inputs["kto_tags"]) - desirable_num + if desirable_num == 0 or undesirable_num == 0: + logger.warning("Your dataset only has one preference type.") + + return model_inputs diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/pairwise.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/pairwise.py new file mode 100644 index 0000000..fe984ef --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/pairwise.py @@ -0,0 +1,123 @@ +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple + +from ...extras.constants import IGNORE_INDEX +from ...extras.logging import get_logger +from .processor_utils import get_paligemma_token_type_ids, get_pixel_values + + +if TYPE_CHECKING: + from transformers import ProcessorMixin + from transformers.tokenization_utils import PreTrainedTokenizer + + from ...hparams import DataArguments + from ..template import Template + + +logger = get_logger(__name__) + + +def _encode_pairwise_example( + prompt: Sequence[Dict[str, str]], + response: Sequence[Dict[str, str]], + system: Optional[str], + tools: Optional[str], + template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], + data_args: "DataArguments", +) -> Tuple[List[int], List[int], List[int], List[int]]: + if processor is not None and not hasattr(processor, "image_seq_length"): # llava-like models + prompt[0]["content"] = template.image_token + prompt[0]["content"] + + chosen_messages = prompt + [response[0]] + rejected_messages = prompt + [response[1]] + prompt_ids, chosen_ids = template.encode_oneturn( + tokenizer, chosen_messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len + ) + _, rejected_ids = template.encode_oneturn( + tokenizer, rejected_messages, system, 
tools, data_args.cutoff_len, data_args.reserved_label_len + ) + + if template.efficient_eos: + chosen_ids += [tokenizer.eos_token_id] + rejected_ids += [tokenizer.eos_token_id] + + if processor is not None and hasattr(processor, "image_seq_length"): # paligemma models + image_token_id = tokenizer.convert_tokens_to_ids(template.image_token) + prompt_ids = [image_token_id] * getattr(processor, "image_seq_length") + prompt_ids + + chosen_input_ids = prompt_ids + chosen_ids + chosen_labels = [IGNORE_INDEX] * len(prompt_ids) + chosen_ids + rejected_input_ids = prompt_ids + rejected_ids + rejected_labels = [IGNORE_INDEX] * len(prompt_ids) + rejected_ids + + return chosen_input_ids, chosen_labels, rejected_input_ids, rejected_labels + + +def preprocess_pairwise_dataset( + examples: Dict[str, List[Any]], + template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], + data_args: "DataArguments", +) -> Dict[str, List[List[int]]]: + # build input pairs with format ` X`, `Y1 ` and `Y2 ` + model_inputs = { + "chosen_input_ids": [], + "chosen_attention_mask": [], + "chosen_labels": [], + "rejected_input_ids": [], + "rejected_attention_mask": [], + "rejected_labels": [], + } + if processor is not None: + model_inputs["pixel_values"] = [] + if hasattr(processor, "image_seq_length"): # paligemma models + model_inputs["chosen_token_type_ids"] = [] + model_inputs["rejected_token_type_ids"] = [] + + for i in range(len(examples["prompt"])): + if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) < 2: + logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i])) + continue + + chosen_input_ids, chosen_labels, rejected_input_ids, rejected_labels = _encode_pairwise_example( + prompt=examples["prompt"][i], + response=examples["response"][i], + system=examples["system"][i], + tools=examples["tools"][i], + template=template, + tokenizer=tokenizer, + processor=processor, + data_args=data_args, + ) + model_inputs["chosen_input_ids"].append(chosen_input_ids) + model_inputs["chosen_attention_mask"].append([1] * len(chosen_input_ids)) + model_inputs["chosen_labels"].append(chosen_labels) + model_inputs["rejected_input_ids"].append(rejected_input_ids) + model_inputs["rejected_attention_mask"].append([1] * len(rejected_input_ids)) + model_inputs["rejected_labels"].append(rejected_labels) + if processor is not None: + model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor)) + if hasattr(processor, "image_seq_length"): # paligemma models + model_inputs["chosen_token_type_ids"].append( + get_paligemma_token_type_ids(len(chosen_input_ids), processor) + ) + model_inputs["rejected_token_type_ids"].append( + get_paligemma_token_type_ids(len(rejected_input_ids), processor) + ) + + return model_inputs + + +def print_pairwise_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None: + valid_chosen_labels = list(filter(lambda x: x != IGNORE_INDEX, example["chosen_labels"])) + valid_rejected_labels = list(filter(lambda x: x != IGNORE_INDEX, example["rejected_labels"])) + print("chosen_input_ids:\n{}".format(example["chosen_input_ids"])) + print("chosen_inputs:\n{}".format(tokenizer.decode(example["chosen_input_ids"], skip_special_tokens=False))) + print("chosen_label_ids:\n{}".format(example["chosen_labels"])) + print("chosen_labels:\n{}".format(tokenizer.decode(valid_chosen_labels, skip_special_tokens=False))) + 
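# [editor's aside — illustrative sketch, not part of the patch] preprocess_pairwise_dataset
# expects each aligned example to carry at least two responses, where index 0 is treated as
# the chosen answer and index 1 as the rejected one; the field values below are hypothetical.
pairwise_example = {
    "prompt": [{"role": "user", "content": "What is 2 + 2?"}],
    "response": [
        {"role": "assistant", "content": "2 + 2 equals 4."},  # index 0 -> chosen
        {"role": "assistant", "content": "It is 5."},          # index 1 -> rejected
    ],
    "system": "",
    "tools": "",
    "images": [],
}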
print("rejected_input_ids:\n{}".format(example["rejected_input_ids"])) + print("rejected_inputs:\n{}".format(tokenizer.decode(example["rejected_input_ids"], skip_special_tokens=False))) + print("rejected_label_ids:\n{}".format(example["rejected_labels"])) + print("rejected_labels:\n{}".format(tokenizer.decode(valid_rejected_labels, skip_special_tokens=False))) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/pretrain.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/pretrain.py new file mode 100644 index 0000000..87727b5 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/pretrain.py @@ -0,0 +1,36 @@ +from itertools import chain +from typing import TYPE_CHECKING, Any, Dict, List + + +if TYPE_CHECKING: + from transformers.tokenization_utils import PreTrainedTokenizer + + from ...hparams import DataArguments + + +def preprocess_pretrain_dataset( + examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments" +) -> Dict[str, List[List[int]]]: + # build grouped texts with format `X1 X2 X3 ...` if packing is enabled + text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]] + + if not data_args.packing: + if data_args.template == "gemma": + text_examples = [tokenizer.bos_token + example for example in text_examples] + + result = tokenizer(text_examples, add_special_tokens=False, max_length=data_args.cutoff_len, truncation=True) + else: + tokenized_examples = tokenizer(text_examples, add_special_tokens=False) + concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()} + total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]]) + block_size = data_args.cutoff_len + total_length = (total_length // block_size) * block_size + result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + if data_args.template == "gemma": + for i in range(len(result["input_ids"])): + result["input_ids"][i][0] = tokenizer.bos_token_id + + return result diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/processor_utils.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/processor_utils.py new file mode 100644 index 0000000..9903a05 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/processor_utils.py @@ -0,0 +1,64 @@ +import bisect +from typing import TYPE_CHECKING, List, Sequence + +from ...extras.packages import is_pillow_available + + +if is_pillow_available(): + from PIL import Image + + +if TYPE_CHECKING: + from numpy.typing import NDArray + from PIL.Image import Image as ImageObject + from transformers import ProcessorMixin + from transformers.image_processing_utils import BaseImageProcessor + + +def search_for_fit(numbers: Sequence[int], capacity: int) -> int: + r""" + Finds the index of largest number that fits into the knapsack with the given capacity. + """ + index = bisect.bisect(numbers, capacity) + return -1 if index == 0 else (index - 1) + + +def greedy_knapsack(numbers: List[int], capacity: int) -> List[List[int]]: + r""" + An efficient greedy algorithm with binary search for the knapsack problem. 
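+ e.g. greedy_knapsack([7, 5, 4, 3, 1], capacity=10) -> [[7, 3], [5, 4, 1]] (note: `numbers` is sorted and consumed in place).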
+ """ + numbers.sort() # sort numbers in ascending order for binary search + knapsacks = [] + + while numbers: + current_knapsack = [] + remaining_capacity = capacity + + while True: + index = search_for_fit(numbers, remaining_capacity) + if index == -1: + break # no more numbers fit in this knapsack + + remaining_capacity -= numbers[index] # update the remaining capacity + current_knapsack.append(numbers.pop(index)) # add the number to knapsack + + knapsacks.append(current_knapsack) + + return knapsacks + + +def get_pixel_values(images: Sequence["ImageObject"], processor: "ProcessorMixin") -> "NDArray": + r""" + Processes visual inputs. (currently only supports a single image) + """ + image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") + image = images[0] if len(images) != 0 else Image.new("RGB", (100, 100), (255, 255, 255)) + return image_processor(image, return_tensors="pt")["pixel_values"][0] # shape (C, H, W) + + +def get_paligemma_token_type_ids(input_len: int, processor: "ProcessorMixin") -> List[int]: + r""" + Gets paligemma token type ids for computing loss. + """ + image_seq_length = getattr(processor, "image_seq_length") + return [0] * image_seq_length + [1] * (input_len - image_seq_length) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/supervised.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/supervised.py new file mode 100644 index 0000000..19d6028 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/supervised.py @@ -0,0 +1,169 @@ +from collections import defaultdict +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple + +from ...extras.constants import IGNORE_INDEX +from ...extras.logging import get_logger +from .processor_utils import get_paligemma_token_type_ids, get_pixel_values, greedy_knapsack + + +if TYPE_CHECKING: + from transformers import ProcessorMixin + from transformers.tokenization_utils import PreTrainedTokenizer + + from ...hparams import DataArguments + from ..template import Template + + +logger = get_logger(__name__) + + +def _encode_supervised_example( + prompt: Sequence[Dict[str, str]], + response: Sequence[Dict[str, str]], + system: Optional[str], + tools: Optional[str], + template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], + data_args: "DataArguments", +) -> Tuple[List[int], List[int]]: + if processor is not None and not hasattr(processor, "image_seq_length"): # llava-like models + prompt[0]["content"] = template.image_token + prompt[0]["content"] + + messages = prompt + response + input_ids, labels = [], [] + + if processor is not None and hasattr(processor, "image_seq_length"): # paligemma models + image_token_id = tokenizer.convert_tokens_to_ids(template.image_token) + input_ids += [image_token_id] * getattr(processor, "image_seq_length") + labels += [IGNORE_INDEX] * getattr(processor, "image_seq_length") + + encoded_pairs = template.encode_multiturn( + tokenizer, messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len + ) + for turn_idx, (source_ids, target_ids) in enumerate(encoded_pairs): + if data_args.train_on_prompt: + source_mask = source_ids + elif turn_idx != 0 and template.efficient_eos: + source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1) + else: + source_mask = [IGNORE_INDEX] * len(source_ids) + + input_ids += source_ids + target_ids + labels += source_mask + target_ids + + if template.efficient_eos: + 
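+ # templates with efficient_eos leave the eos token out of each assistant slot, so it is appended once at the end of the encoded example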
input_ids += [tokenizer.eos_token_id] + labels += [tokenizer.eos_token_id] + + return input_ids, labels + + +def preprocess_supervised_dataset( + examples: Dict[str, List[Any]], + template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], + data_args: "DataArguments", +) -> Dict[str, List[List[int]]]: + # build inputs with format ` X Y ` and labels with format ` ... Y ` + # for multiturn examples, we only mask the prompt part in each prompt-response pair. + model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} + if processor is not None: + model_inputs["pixel_values"] = [] + if hasattr(processor, "image_seq_length"): # paligemma models + model_inputs["token_type_ids"] = [] + + for i in range(len(examples["prompt"])): + if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1: + logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i])) + continue + + input_ids, labels = _encode_supervised_example( + prompt=examples["prompt"][i], + response=examples["response"][i], + system=examples["system"][i], + tools=examples["tools"][i], + template=template, + tokenizer=tokenizer, + processor=processor, + data_args=data_args, + ) + model_inputs["input_ids"].append(input_ids) + model_inputs["attention_mask"].append([1] * len(input_ids)) + model_inputs["labels"].append(labels) + if processor is not None: + model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor)) + if hasattr(processor, "image_seq_length"): # paligemma models + model_inputs["token_type_ids"].append(get_paligemma_token_type_ids(len(input_ids), processor)) + + return model_inputs + + +def preprocess_packed_supervised_dataset( + examples: Dict[str, List[Any]], + template: "Template", + tokenizer: "PreTrainedTokenizer", + data_args: "DataArguments", +) -> Dict[str, List[List[int]]]: + # build inputs with format ` X1 Y1 X2 Y2 ` + # and labels with format ` ... Y1 ... 
Y2 ` + valid_num = 0 + batch_input_ids, batch_labels = [], [] + lengths = [] + length2indexes = defaultdict(list) + for i in range(len(examples["prompt"])): + if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1: + logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i])) + continue + + input_ids, labels = _encode_supervised_example( + prompt=examples["prompt"][i], + response=examples["response"][i], + system=examples["system"][i], + tools=examples["tools"][i], + template=template, + tokenizer=tokenizer, + processor=None, + data_args=data_args, + ) + length = len(input_ids) + if length > data_args.cutoff_len: + logger.warning("Dropped lengthy example with length {} > {}.".format(length, data_args.cutoff_len)) + else: + lengths.append(length) + length2indexes[length].append(valid_num) + batch_input_ids.append(input_ids) + batch_labels.append(labels) + valid_num += 1 + + model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} + knapsacks = greedy_knapsack(lengths, data_args.cutoff_len) + for knapsack in knapsacks: + packed_input_ids, packed_labels = [], [] + for length in knapsack: + index = length2indexes[length].pop() + packed_input_ids += batch_input_ids[index] + packed_labels += batch_labels[index] + + if len(packed_input_ids) < data_args.cutoff_len: + pad_length = data_args.cutoff_len - len(packed_input_ids) + packed_input_ids += [tokenizer.pad_token_id] * pad_length + packed_labels += [IGNORE_INDEX] * pad_length + + if len(packed_input_ids) != data_args.cutoff_len: + raise ValueError("The length of packed example should be identical to the cutoff length.") + + model_inputs["input_ids"].append(packed_input_ids) + model_inputs["attention_mask"].append([1] * data_args.cutoff_len) + model_inputs["labels"].append(packed_labels) + + return model_inputs + + +def print_supervised_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None: + valid_labels = list(filter(lambda x: x != IGNORE_INDEX, example["labels"])) + print("input_ids:\n{}".format(example["input_ids"])) + print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False))) + print("label_ids:\n{}".format(example["labels"])) + print("labels:\n{}".format(tokenizer.decode(valid_labels, skip_special_tokens=False))) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/unsupervised.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/unsupervised.py new file mode 100644 index 0000000..f711eea --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/processors/unsupervised.py @@ -0,0 +1,92 @@ +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple + +from ...extras.logging import get_logger +from ..data_utils import Role +from .processor_utils import get_paligemma_token_type_ids, get_pixel_values + + +if TYPE_CHECKING: + from transformers import ProcessorMixin + from transformers.tokenization_utils import PreTrainedTokenizer + + from ...hparams import DataArguments + from ..template import Template + + +logger = get_logger(__name__) + + +def _encode_unsupervised_example( + prompt: Sequence[Dict[str, str]], + response: Sequence[Dict[str, str]], + system: Optional[str], + tools: Optional[str], + template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], + data_args: "DataArguments", +) -> Tuple[List[int], List[int]]: + if processor is not None and not hasattr(processor, 
"image_seq_length"): # llava-like models + prompt[0]["content"] = template.image_token + prompt[0]["content"] + + if len(response) == 1: + messages = prompt + response + else: + messages = prompt + [{"role": Role.ASSISTANT.value, "content": ""}] + + input_ids, labels = template.encode_oneturn( + tokenizer, messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len + ) + if template.efficient_eos: + labels += [tokenizer.eos_token_id] + + if processor is not None and hasattr(processor, "image_seq_length"): # paligemma models + image_token_id = tokenizer.convert_tokens_to_ids(template.image_token) + input_ids = [image_token_id] * getattr(processor, "image_seq_length") + input_ids + + return input_ids, labels + + +def preprocess_unsupervised_dataset( + examples: Dict[str, List[Any]], + template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], + data_args: "DataArguments", +) -> Dict[str, List[List[int]]]: + # build inputs with format ` X` and labels with format `Y ` + model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} + if processor is not None: + model_inputs["pixel_values"] = [] + if hasattr(processor, "image_seq_length"): # paligemma models + model_inputs["token_type_ids"] = [] + + for i in range(len(examples["prompt"])): + if len(examples["prompt"][i]) % 2 != 1: + logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i])) + continue + + input_ids, labels = _encode_unsupervised_example( + prompt=examples["prompt"][i], + response=examples["response"][i], + system=examples["system"][i], + tools=examples["tools"][i], + template=template, + tokenizer=tokenizer, + processor=processor, + data_args=data_args, + ) + model_inputs["input_ids"].append(input_ids) + model_inputs["attention_mask"].append([1] * len(input_ids)) + model_inputs["labels"].append(labels) + if processor is not None: + model_inputs["pixel_values"].append(get_pixel_values(examples["images"][i], processor)) + if hasattr(processor, "image_seq_length"): # paligemma models + model_inputs["token_type_ids"].append(get_paligemma_token_type_ids(len(input_ids), processor)) + + return model_inputs + + +def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None: + print("input_ids:\n{}".format(example["input_ids"])) + print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False))) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/template.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/template.py similarity index 77% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/data/template.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/data/template.py index af80272..b600c56 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/template.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/data/template.py @@ -2,8 +2,8 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple, Union from ..extras.logging import get_logger +from .data_utils import Role, infer_max_len from .formatter import EmptyFormatter, FunctionFormatter, StringFormatter, ToolFormatter -from .utils import Role, infer_max_len if TYPE_CHECKING: @@ -26,6 +26,7 @@ class Template: format_separator: "Formatter" default_system: str stop_words: List[str] + image_token: str efficient_eos: bool replace_eos: bool force_system: bool @@ -68,8 +69,8 @@ class Template: self, tokenizer: 
"PreTrainedTokenizer", messages: List[Dict[str, str]], - system: str, - tools: str, + system: Optional[str], + tools: Optional[str], cutoff_len: int, reserved_label_len: int, ) -> Sequence[Tuple[List[int], List[int]]]: @@ -195,7 +196,7 @@ class Llama2Template(Template): return self._make_pairs(encoded_messages, cutoff_len, reserved_label_len) -templates: Dict[str, Template] = {} +TEMPLATES: Dict[str, Template] = {} def _register_template( @@ -209,6 +210,7 @@ def _register_template( format_separator: Optional["Formatter"] = None, default_system: str = "", stop_words: List[str] = [], + image_token: str = "", efficient_eos: bool = False, replace_eos: bool = False, force_system: bool = False, @@ -246,7 +248,7 @@ def _register_template( default_function_formatter = FunctionFormatter(slots=["Action: {{name}}\nAction Input: {{arguments}}"] + eos_slots) default_tool_formatter = ToolFormatter(tool_format="default") default_separator_formatter = EmptyFormatter() - templates[name] = template_class( + TEMPLATES[name] = template_class( format_user=format_user or default_user_formatter, format_assistant=format_assistant or default_assistant_formatter, format_system=format_system or default_user_formatter, @@ -256,6 +258,7 @@ def _register_template( format_separator=format_separator or default_separator_formatter, default_system=default_system, stop_words=stop_words, + image_token=image_token, efficient_eos=efficient_eos, replace_eos=replace_eos, force_system=force_system, @@ -276,7 +279,7 @@ def _add_or_replace_eos_token(tokenizer: "PreTrainedTokenizer", eos_token: str) def _jinja_escape(content: str) -> str: - return content.replace("\n", r"\n").replace("'", r"\'") + return content.replace("'", r"\'") def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content") -> str: @@ -290,10 +293,10 @@ def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", pl slot_items.append(placeholder) if slot_pieces[1]: slot_items.append("'" + _jinja_escape(slot_pieces[1]) + "'") - elif isinstance(slot, set): - if "bos_token" in slot: + elif isinstance(slot, set): # do not use {{ eos_token }} since it may be replaced + if "bos_token" in slot and tokenizer.bos_token_id is not None: slot_items.append("'" + tokenizer.bos_token + "'") - elif "eos_token" in slot: # do not use {{ eos_token }} since it may be replaced + elif "eos_token" in slot and tokenizer.eos_token_id is not None: slot_items.append("'" + tokenizer.eos_token + "'") elif isinstance(slot, dict): raise ValueError("Dict is not supported.") @@ -308,7 +311,7 @@ def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer") jinja_template += "{% set system_message = '" + _jinja_escape(template.default_system) + "' %}" jinja_template += ( - "{% if messages[0]['role'] == 'system' %}" "{% set system_message = messages[0]['content'] %}" "{% endif %}" + "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}" ) system_message = _convert_slots_to_jinja(template.format_system.apply(), tokenizer, placeholder="system_message") @@ -325,9 +328,11 @@ def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer") jinja_template += "{% if loop.index0 == 0 and system_message is defined %}" jinja_template += "{% set content = " + system_message + " + message['content'] %}" jinja_template += "{% endif %}" + jinja_template += "{% if message['role'] == 'user' %}" user_message = _convert_slots_to_jinja(template.format_user.apply(), tokenizer) 
jinja_template += "{{ " + user_message + " }}" + jinja_template += "{% elif message['role'] == 'assistant' %}" assistant_message = _convert_slots_to_jinja( template.format_assistant.apply() + template.format_separator.apply(), tokenizer @@ -343,9 +348,9 @@ def get_template_and_fix_tokenizer( name: Optional[str] = None, ) -> Template: if name is None: - template = templates["vanilla"] # placeholder + template = TEMPLATES["empty"] # placeholder else: - template = templates.get(name, None) + template = TEMPLATES.get(name, None) if template is None: raise ValueError("Template {} does not exist.".format(name)) @@ -385,7 +390,8 @@ _register_template( format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]), format_separator=EmptyFormatter(slots=["\n\n"]), default_system=( - "Below is an instruction that describes a task. " "Write a response that appropriately completes the request." + "Below is an instruction that describes a task. " + "Write a response that appropriately completes the request.\n\n" ), ) @@ -414,7 +420,7 @@ _register_template( _register_template( name="baichuan", - format_user=StringFormatter(slots=["{{content}}"]), + format_user=StringFormatter(slots=[{"token": ""}, "{{content}}", {"token": ""}]), efficient_eos=True, ) @@ -441,6 +447,18 @@ _register_template( ) +_register_template( + name="breeze", + format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]), + format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), + default_system=( + "You are a helpful AI assistant built by MediaTek Research. " + "The user you are helping speaks Traditional Chinese and comes from Taiwan." + ), + efficient_eos=True, +) + + _register_template( name="chatglm2", format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问:{{content}}\n\n答:"]), @@ -490,6 +508,7 @@ _register_template( name="chatml", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), format_separator=EmptyFormatter(slots=["\n"]), stop_words=["<|im_end|>", "<|im_start|>"], replace_eos=True, @@ -500,6 +519,7 @@ _register_template( name="chatml_de", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), format_separator=EmptyFormatter(slots=["\n"]), default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.", stop_words=["<|im_end|>", "<|im_start|>"], @@ -514,6 +534,26 @@ _register_template( ) +_register_template( + name="cohere", + format_user=StringFormatter( + slots=[ + ( + "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>" + "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" + ) + ] + ), + format_system=StringFormatter( + slots=[{"bos_token"}, "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>"] + ), + default_system=( + "You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users " + "by providing thorough responses. You are trained by Cohere." 
+ ), +) + + _register_template( name="cpm", format_user=StringFormatter(slots=["<用户>{{content}}"]), @@ -522,6 +562,32 @@ _register_template( ) +_register_template( + name="dbrx", + format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_separator=EmptyFormatter(slots=["\n"]), + default_system=( + "You are DBRX, created by Databricks. You were last updated in December 2023. " + "You answer questions based on information available up to that point.\n" + "YOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough " + "responses to more complex and open-ended questions.\nYou assist with various tasks, " + "from writing to coding (using markdown for code blocks — remember to use ``` with " + "code, JSON, and tables).\n(You do not have real-time data access or code execution " + "capabilities. You avoid stereotyping and provide balanced perspectives on " + "controversial topics. You do not provide song lyrics, poems, or news articles and " + "do not divulge details of your training data.)\nThis is your system prompt, " + "guiding your responses. Do not reference it, just respond to the user. If you find " + "yourself talking about this message, stop. You should be responding appropriately " + "and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION " + "ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY." + ), + stop_words=["<|im_end|>"], + replace_eos=True, +) + + _register_template( name="deepseek", format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]), @@ -554,6 +620,16 @@ _register_template( ) +_register_template( + name="empty", + format_user=StringFormatter(slots=["{{content}}"]), + format_assistant=StringFormatter(slots=["{{content}}"]), + format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), + efficient_eos=True, + force_system=True, +) + + _register_template( name="falcon", format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]), @@ -562,16 +638,39 @@ _register_template( ) +_register_template( + name="fewshot", + format_separator=EmptyFormatter(slots=["\n\n"]), + efficient_eos=True, +) + + _register_template( name="gemma", format_user=StringFormatter(slots=["user\n{{content}}\nmodel\n"]), format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), + format_observation=StringFormatter( + slots=["tool\n{{content}}\nmodel\n"] + ), format_separator=EmptyFormatter(slots=["\n"]), efficient_eos=True, force_system=True, ) +_register_template( + name="glm4", + format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]), + format_assistant=StringFormatter(slots=["\n{{content}}"]), + format_system=StringFormatter(slots=["[gMASK]{{content}}"]), + format_function=FunctionFormatter(slots=["{{name}}\n{{arguments}}"]), + format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]), + stop_words=["<|user|>", "<|observation|>"], + efficient_eos=True, + force_system=True, +) + + _register_template( name="intern", format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": ""}, "\n<|Bot|>:"]), @@ -601,17 +700,8 @@ _register_template( _register_template( name="llama2", format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]), + 
format_assistant=StringFormatter(slots=[" {{content}} ", {"eos_token"}]), format_system=StringFormatter(slots=["<>\n{{content}}\n<>\n\n"]), - default_system=( - "You are a helpful, respectful and honest assistant. " - "Always answer as helpfully as possible, while being safe. " - "Your answers should not include any harmful, unethical, " - "racist, sexist, toxic, dangerous, or illegal content. " - "Please ensure that your responses are socially unbiased and positive in nature.\n\n" - "If a question does not make any sense, or is not factually coherent, " - "explain why instead of answering something not correct. " - "If you don't know the answer to a question, please don't share false information." - ), ) @@ -623,6 +713,33 @@ _register_template( ) +_register_template( + name="llama3", + format_user=StringFormatter( + slots=[ + ( + "<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\n" + ) + ] + ), + format_system=StringFormatter( + slots=[{"bos_token"}, "<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"] + ), + format_observation=StringFormatter( + slots=[ + ( + "<|start_header_id|>tool<|end_header_id|>\n\n{{content}}<|eot_id|>" + "<|start_header_id|>assistant<|end_header_id|>\n\n" + ) + ] + ), + default_system="You are a helpful assistant.", + stop_words=["<|eot_id|>"], + replace_eos=True, +) + + _register_template( name="mistral", format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]), @@ -633,8 +750,7 @@ _register_template( _register_template( name="olmo", - format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]), - format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}]), + format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>\n"]), format_system=StringFormatter(slots=[{"eos_token"}, "{{content}}"]), force_system=True, ) @@ -643,12 +759,28 @@ _register_template( _register_template( name="openchat", format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]), - format_assistant=StringFormatter(slots=["{{content}}", {"eos_token"}]), format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), force_system=True, ) +_register_template( + name="openchat-3.6", + format_user=StringFormatter( + slots=[ + ( + "<|start_header_id|>GPT4 Correct User<|end_header_id|>\n\n{{content}}<|eot_id|>" + "<|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n" + ) + ] + ), + format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]), + stop_words=["<|eot_id|>"], + replace_eos=True, + force_system=True, +) + + _register_template( name="orion", format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]), @@ -657,10 +789,22 @@ _register_template( ) +_register_template( + name="phi", + format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]), + format_system=StringFormatter(slots=[{"bos_token"}, "<|system|>\n{{content}}<|end|>\n"]), + format_separator=EmptyFormatter(slots=["\n"]), + default_system="You are a helpful AI assistant.", + stop_words=["<|end|>"], + replace_eos=True, +) + + _register_template( name="qwen", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), 
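+ # tool observations are rendered as a dedicated <|im_start|>tool turn, mirroring the chatml template above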
format_separator=EmptyFormatter(slots=["\n"]), default_system="You are a helpful assistant.", stop_words=["<|im_end|>"], @@ -688,7 +832,11 @@ _register_template( _register_template( - name="vanilla", + name="telechat", + format_user=StringFormatter(slots=["<_user>{{content}}<_bot>"]), + format_system=StringFormatter(slots=["<_system>{{content}}<_end>"]), + stop_words=["<_end>"], + replace_eos=True, ) @@ -742,12 +890,29 @@ _register_template( _register_template( name="yi", format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), format_separator=EmptyFormatter(slots=["\n"]), stop_words=["<|im_end|>"], replace_eos=True, ) +_register_template( + name="yi_vl", + format_user=StringFormatter(slots=["### Human: {{content}}\n### Assistant:"]), + format_separator=EmptyFormatter(slots=["\n"]), + default_system=( + "This is a chat between an inquisitive human and an AI assistant. " + "Assume the role of the AI assistant. Read all the images carefully, " + "and respond to the human's questions with informative, helpful, detailed and polite answers. " + "这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。" + "仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n\n" + ), + stop_words=["###"], + efficient_eos=True, +) + + _register_template( name="yuan", format_user=StringFormatter(slots=["{{content}}", {"token": ""}]), @@ -762,7 +927,7 @@ _register_template( format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]), format_assistant=StringFormatter(slots=["\n{{content}}", {"eos_token"}]), format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]), - default_system="You are a friendly chatbot who always responds in the style of a pirate", + default_system="You are Zephyr, a helpful assistant.", ) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/eval/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/eval/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/eval/evaluator.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/eval/evaluator.py similarity index 95% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/eval/evaluator.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/eval/evaluator.py index 4969561..192f481 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/eval/evaluator.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/eval/evaluator.py @@ -14,16 +14,17 @@ from transformers.utils import cached_file from ..data import get_template_and_fix_tokenizer from ..extras.constants import CHOICES, SUBJECTS from ..hparams import get_eval_args -from ..model import load_model_and_tokenizer +from ..model import load_model, load_tokenizer from .template import get_eval_template class Evaluator: def __init__(self, args: Optional[Dict[str, Any]] = None) -> None: self.model_args, self.data_args, self.eval_args, finetuning_args = get_eval_args(args) - self.model, self.tokenizer = load_model_and_tokenizer(self.model_args, finetuning_args) + self.tokenizer = load_tokenizer(self.model_args)["tokenizer"] self.tokenizer.padding_side = "right" # avoid overflow issue in batched inference for llama2 self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args.template) + self.model = load_model(self.tokenizer, self.model_args, finetuning_args) self.eval_template = get_eval_template(self.eval_args.lang) self.choice_inputs = 
[ self.tokenizer.encode(self.eval_template.prefix + ch, add_special_tokens=False)[-1] for ch in CHOICES @@ -117,6 +118,5 @@ class Evaluator: f.write(score_info) -if __name__ == "__main__": - evaluator = Evaluator() - evaluator.eval() +def run_eval() -> None: + Evaluator().eval() diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/eval/template.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/eval/template.py similarity index 57% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/eval/template.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/eval/template.py index b17f708..a4a6ef0 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/eval/template.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/eval/template.py @@ -1,14 +1,10 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Tuple +from typing import Dict, List, Sequence, Tuple from ..data import Role from ..extras.constants import CHOICES -if TYPE_CHECKING: - from datasets import Dataset - - @dataclass class EvalTemplate: system: str @@ -16,22 +12,29 @@ class EvalTemplate: answer: str prefix: str - def parse_example(self, example: Dict[str, str]) -> Tuple[str, str]: + def _parse_example(self, example: Dict[str, str]) -> Tuple[str, str]: + r""" + input: a dict with keys {"question", "A", "B", "C", "D", "answer"} + output: a tuple of (prompt, response) + """ candidates = [self.choice.format(choice=ch, content=example[ch]) for ch in CHOICES if ch in example] return "".join([example["question"]] + candidates + [self.answer]), example["answer"] def format_example( - self, target_data: Dict[str, str], support_set: "Dataset", subject_name: str + self, target_data: Dict[str, str], support_set: Sequence[Dict[str, str]], subject_name: str ) -> List[Dict[str, str]]: + r""" + Converts dataset examples to messages. + """ messages = [] for k in range(len(support_set)): - prompt, response = self.parse_example(support_set[k]) - messages.append({"role": Role.USER, "content": prompt}) - messages.append({"role": Role.ASSISTANT, "content": response}) + prompt, response = self._parse_example(support_set[k]) + messages.append({"role": Role.USER.value, "content": prompt}) + messages.append({"role": Role.ASSISTANT.value, "content": response}) - prompt, response = self.parse_example(target_data) - messages.append({"role": Role.USER, "content": prompt}) - messages.append({"role": Role.ASSISTANT, "content": response}) + prompt, response = self._parse_example(target_data) + messages.append({"role": Role.USER.value, "content": prompt}) + messages.append({"role": Role.ASSISTANT.value, "content": response}) messages[0]["content"] = self.system.format(subject=subject_name) + messages[0]["content"] return messages @@ -39,7 +42,7 @@ class EvalTemplate: eval_templates: Dict[str, "EvalTemplate"] = {} -def register_eval_template(name: str, system: str, choice: str, answer: str, prefix: str) -> None: +def _register_eval_template(name: str, system: str, choice: str, answer: str, prefix: str) -> None: eval_templates[name] = EvalTemplate(system=system, choice=choice, answer=answer, prefix=prefix) @@ -49,7 +52,7 @@ def get_eval_template(name: str) -> "EvalTemplate": return eval_template -register_eval_template( +_register_eval_template( name="en", system="The following are multiple choice questions (with answers) about {subject}.\n\n", choice="\n{choice}. 
{content}", @@ -58,10 +61,10 @@ register_eval_template( ) -register_eval_template( +_register_eval_template( name="zh", system="以下是中国关于{subject}考试的单项选择题,请选出其中的正确答案。\n\n", choice="\n{choice}. {content}", answer="\n答案:", - prefix="\n", + prefix=" ", ) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/callbacks.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/callbacks.py new file mode 100644 index 0000000..441ebbf --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/callbacks.py @@ -0,0 +1,217 @@ +import json +import logging +import os +import signal +import sys +import time +from concurrent.futures import ThreadPoolExecutor +from datetime import timedelta +from typing import TYPE_CHECKING, Any, Dict, Optional + +import transformers +from transformers import TrainerCallback +from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length + +from .constants import TRAINER_LOG +from .logging import LoggerHandler, get_logger +from .misc import fix_valuehead_checkpoint + + +if TYPE_CHECKING: + from transformers import TrainerControl, TrainerState, TrainingArguments + + +logger = get_logger(__name__) + + +class FixValueHeadModelCallback(TrainerCallback): + def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called after a checkpoint save. + """ + if args.should_save: + fix_valuehead_checkpoint( + model=kwargs.pop("model"), + output_dir=os.path.join(args.output_dir, "{}-{}".format(PREFIX_CHECKPOINT_DIR, state.global_step)), + safe_serialization=args.save_safetensors, + ) + + +class LogCallback(TrainerCallback): + def __init__(self, output_dir: str) -> None: + r""" + Initializes a callback for logging training and evaluation status. 
+ """ + """ Progress """ + self.start_time = 0 + self.cur_steps = 0 + self.max_steps = 0 + self.elapsed_time = "" + self.remaining_time = "" + self.thread_pool: Optional["ThreadPoolExecutor"] = None + """ Status """ + self.aborted = False + self.do_train = False + """ Web UI """ + self.webui_mode = os.environ.get("LLAMABOARD_ENABLED", "0").lower() in ["true", "1"] + if self.webui_mode: + signal.signal(signal.SIGABRT, self._set_abort) + self.logger_handler = LoggerHandler(output_dir) + logging.root.addHandler(self.logger_handler) + transformers.logging.add_handler(self.logger_handler) + + def _set_abort(self, signum, frame) -> None: + self.aborted = True + + def _reset(self, max_steps: int = 0) -> None: + self.start_time = time.time() + self.cur_steps = 0 + self.max_steps = max_steps + self.elapsed_time = "" + self.remaining_time = "" + + def _timing(self, cur_steps: int) -> None: + cur_time = time.time() + elapsed_time = cur_time - self.start_time + avg_time_per_step = elapsed_time / cur_steps if cur_steps != 0 else 0 + remaining_time = (self.max_steps - cur_steps) * avg_time_per_step + self.cur_steps = cur_steps + self.elapsed_time = str(timedelta(seconds=int(elapsed_time))) + self.remaining_time = str(timedelta(seconds=int(remaining_time))) + + def _write_log(self, output_dir: str, logs: Dict[str, Any]) -> None: + with open(os.path.join(output_dir, TRAINER_LOG), "a", encoding="utf-8") as f: + f.write(json.dumps(logs) + "\n") + + def _create_thread_pool(self, output_dir: str) -> None: + os.makedirs(output_dir, exist_ok=True) + self.thread_pool = ThreadPoolExecutor(max_workers=1) + + def _close_thread_pool(self) -> None: + if self.thread_pool is not None: + self.thread_pool.shutdown(wait=True) + self.thread_pool = None + + def on_init_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called at the end of the initialization of the `Trainer`. + """ + if ( + args.should_save + and os.path.exists(os.path.join(args.output_dir, TRAINER_LOG)) + and args.overwrite_output_dir + ): + logger.warning("Previous trainer log in this folder will be deleted.") + os.remove(os.path.join(args.output_dir, TRAINER_LOG)) + + def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called at the beginning of training. + """ + if args.should_save: + self.do_train = True + self._reset(max_steps=state.max_steps) + self._create_thread_pool(output_dir=args.output_dir) + + def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called at the end of training. + """ + self._close_thread_pool() + + def on_substep_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called at the end of an substep during gradient accumulation. + """ + if self.aborted: + control.should_epoch_stop = True + control.should_training_stop = True + + def on_step_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called at the end of a training step. + """ + if self.aborted: + control.should_epoch_stop = True + control.should_training_stop = True + + def on_evaluate(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called after an evaluation phase. 
+ """ + if not self.do_train: + self._close_thread_pool() + + def on_predict(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called after a successful prediction. + """ + if not self.do_train: + self._close_thread_pool() + + def on_log(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + r""" + Event called after logging the last logs. + """ + if not args.should_save: + return + + self._timing(cur_steps=state.global_step) + logs = dict( + current_steps=self.cur_steps, + total_steps=self.max_steps, + loss=state.log_history[-1].get("loss", None), + eval_loss=state.log_history[-1].get("eval_loss", None), + predict_loss=state.log_history[-1].get("predict_loss", None), + reward=state.log_history[-1].get("reward", None), + accuracy=state.log_history[-1].get("rewards/accuracies", None), + learning_rate=state.log_history[-1].get("learning_rate", None), + epoch=state.log_history[-1].get("epoch", None), + percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100, + elapsed_time=self.elapsed_time, + remaining_time=self.remaining_time, + throughput="{:.2f}".format(state.num_input_tokens_seen / (time.time() - self.start_time)), + total_tokens=state.num_input_tokens_seen, + ) + logs = {k: v for k, v in logs.items() if v is not None} + if self.webui_mode and all(key in logs for key in ["loss", "learning_rate", "epoch"]): + logger.info( + "{{'loss': {:.4f}, 'learning_rate': {:2.4e}, 'epoch': {:.2f}, 'throughput': {}}}".format( + logs["loss"], logs["learning_rate"], logs["epoch"], logs["throughput"] + ) + ) + + if self.thread_pool is not None: + self.thread_pool.submit(self._write_log, args.output_dir, logs) + + def on_prediction_step( + self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs + ): + r""" + Event called after a prediction step. 
+ """ + if self.do_train: + return + + if self.aborted: + sys.exit(0) + + if not args.should_save: + return + + eval_dataloader = kwargs.pop("eval_dataloader", None) + if has_length(eval_dataloader): + if self.max_steps == 0: + self._reset(max_steps=len(eval_dataloader)) + self._create_thread_pool(output_dir=args.output_dir) + + self._timing(cur_steps=self.cur_steps + 1) + if self.cur_steps % 5 == 0 and self.thread_pool is not None: + logs = dict( + current_steps=self.cur_steps, + total_steps=self.max_steps, + percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100, + elapsed_time=self.elapsed_time, + remaining_time=self.remaining_time, + ) + self.thread_pool.submit(self._write_log, args.output_dir, logs) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/constants.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/constants.py similarity index 58% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/constants.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/constants.py index 12ba8b2..7d96fb5 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/constants.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/constants.py @@ -2,13 +2,24 @@ from collections import OrderedDict, defaultdict from enum import Enum from typing import Dict, Optional +from peft.utils import SAFETENSORS_WEIGHTS_NAME as SAFE_ADAPTER_WEIGHTS_NAME +from peft.utils import WEIGHTS_NAME as ADAPTER_WEIGHTS_NAME +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME + + +CHECKPOINT_NAMES = { + SAFE_ADAPTER_WEIGHTS_NAME, + ADAPTER_WEIGHTS_NAME, + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, + WEIGHTS_NAME, +} CHOICES = ["A", "B", "C", "D"] DATA_CONFIG = "dataset_info.json" -DEFAULT_MODULE = defaultdict(str) - DEFAULT_TEMPLATE = defaultdict(str) FILEEXT2TYPE = { @@ -24,28 +35,43 @@ IGNORE_INDEX = -100 LAYERNORM_NAMES = {"norm", "ln"} -LOG_FILE_NAME = "trainer_log.jsonl" +LLAMABOARD_CONFIG = "llamaboard_config.yaml" METHODS = ["full", "freeze", "lora"] -PEFT_METHODS = ["lora"] +MOD_SUPPORTED_MODELS = {"bloom", "falcon", "gemma", "llama", "mistral", "mixtral", "phi", "starcoder2"} + +PEFT_METHODS = {"lora"} + +RUNNING_LOG = "running_log.txt" SUBJECTS = ["Average", "STEM", "Social Sciences", "Humanities", "Other"] SUPPORTED_MODELS = OrderedDict() +TRAINER_LOG = "trainer_log.jsonl" + +TRAINING_ARGS = "training_args.yaml" + TRAINING_STAGES = { "Supervised Fine-Tuning": "sft", "Reward Modeling": "rm", "PPO": "ppo", "DPO": "dpo", + "KTO": "kto", "Pre-Training": "pt", } +STAGES_USE_PAIR_DATA = {"rm", "dpo"} + +SUPPORTED_CLASS_FOR_S2ATTN = {"llama"} + V_HEAD_WEIGHTS_NAME = "value_head.bin" V_HEAD_SAFE_WEIGHTS_NAME = "value_head.safetensors" +VISION_MODELS = set() + class DownloadSource(str, Enum): DEFAULT = "hf" @@ -54,8 +80,8 @@ class DownloadSource(str, Enum): def register_model_group( models: Dict[str, Dict[DownloadSource, str]], - module: Optional[str] = None, template: Optional[str] = None, + vision: bool = False, ) -> None: prefix = None for name, path in models.items(): @@ -64,10 +90,23 @@ def register_model_group( else: assert prefix == name.split("-")[0], "prefix should be identical." 
SUPPORTED_MODELS[name] = path - if module is not None: - DEFAULT_MODULE[prefix] = module if template is not None: DEFAULT_TEMPLATE[prefix] = template + if vision: + VISION_MODELS.add(prefix) + + +register_model_group( + models={ + "Aya-23-8B-Chat": { + DownloadSource.DEFAULT: "CohereForAI/aya-23-8B", + }, + "Aya-23-35B-Chat": { + DownloadSource.DEFAULT: "CohereForAI/aya-23-35B", + }, + }, + template="cohere", +) register_model_group( @@ -85,7 +124,6 @@ register_model_group( DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan-13B-Chat", }, }, - module="W_pack", template="baichuan", ) @@ -109,7 +147,6 @@ register_model_group( DownloadSource.MODELSCOPE: "baichuan-inc/Baichuan2-13B-Chat", }, }, - module="W_pack", template="baichuan2", ) @@ -129,7 +166,6 @@ register_model_group( DownloadSource.MODELSCOPE: "AI-ModelScope/bloom-7b1", }, }, - module="query_key_value", ) @@ -148,7 +184,6 @@ register_model_group( DownloadSource.MODELSCOPE: "AI-ModelScope/bloomz-7b1-mt", }, }, - module="query_key_value", ) @@ -167,6 +202,19 @@ register_model_group( ) +register_model_group( + models={ + "Breeze-7B": { + DownloadSource.DEFAULT: "MediaTek-Research/Breeze-7B-Base-v1_0", + }, + "Breeze-7B-Chat": { + DownloadSource.DEFAULT: "MediaTek-Research/Breeze-7B-Instruct-v1_0", + }, + }, + template="breeze", +) + + register_model_group( models={ "ChatGLM2-6B-Chat": { @@ -174,7 +222,6 @@ register_model_group( DownloadSource.MODELSCOPE: "ZhipuAI/chatglm2-6b", } }, - module="query_key_value", template="chatglm2", ) @@ -190,7 +237,6 @@ register_model_group( DownloadSource.MODELSCOPE: "ZhipuAI/chatglm3-6b", }, }, - module="query_key_value", template="chatglm3", ) @@ -226,6 +272,73 @@ register_model_group( ) +register_model_group( + models={ + "CodeGemma-7B": { + DownloadSource.DEFAULT: "google/codegemma-7b", + }, + "CodeGemma-7B-Chat": { + DownloadSource.DEFAULT: "google/codegemma-7b-it", + DownloadSource.MODELSCOPE: "AI-ModelScope/codegemma-7b-it", + }, + "CodeGemma-1.1-2B": { + DownloadSource.DEFAULT: "google/codegemma-1.1-2b", + }, + "CodeGemma-1.1-7B-Chat": { + DownloadSource.DEFAULT: "google/codegemma-1.1-7b-it", + }, + }, + template="gemma", +) + + +register_model_group( + models={ + "Codestral-22B-v0.1-Chat": { + DownloadSource.DEFAULT: "mistralai/Codestral-22B-v0.1", + }, + }, + template="mistral", +) + + +register_model_group( + models={ + "CommandR-35B-Chat": { + DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-v01", + DownloadSource.MODELSCOPE: "AI-ModelScope/c4ai-command-r-v01", + }, + "CommandR-Plus-104B-Chat": { + DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-plus", + DownloadSource.MODELSCOPE: "AI-ModelScope/c4ai-command-r-plus", + }, + "CommandR-35B-4bit-Chat": { + DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-v01-4bit", + DownloadSource.MODELSCOPE: "mirror013/c4ai-command-r-v01-4bit", + }, + "CommandR-Plus-104B-4bit-Chat": { + DownloadSource.DEFAULT: "CohereForAI/c4ai-command-r-plus-4bit", + }, + }, + template="cohere", +) + + +register_model_group( + models={ + "DBRX-132B-Base": { + DownloadSource.DEFAULT: "databricks/dbrx-base", + DownloadSource.MODELSCOPE: "AI-ModelScope/dbrx-base", + }, + "DBRX-132B-Chat": { + DownloadSource.DEFAULT: "databricks/dbrx-instruct", + DownloadSource.MODELSCOPE: "AI-ModelScope/dbrx-instruct", + }, + }, + template="dbrx", +) + + register_model_group( models={ "DeepSeek-LLM-7B-Base": { @@ -246,18 +359,36 @@ register_model_group( }, "DeepSeek-Math-7B-Base": { DownloadSource.DEFAULT: "deepseek-ai/deepseek-math-7b-base", + DownloadSource.MODELSCOPE: 
"deepseek-ai/deepseek-math-7b-base", }, "DeepSeek-Math-7B-Chat": { DownloadSource.DEFAULT: "deepseek-ai/deepseek-math-7b-instruct", + DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-math-7b-instruct", }, "DeepSeek-MoE-16B-Base": { DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-base", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-base", }, + "DeepSeek-MoE-16B-v2-Base": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Lite", + DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Lite", + }, + "DeepSeek-MoE-236B-Base": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2", + DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2", + }, "DeepSeek-MoE-16B-Chat": { DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-chat", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-chat", }, + "DeepSeek-MoE-16B-v2-Chat": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Lite-Chat", + DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Lite-Chat", + }, + "DeepSeek-MoE-236B-Chat": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Chat", + DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Chat", + }, }, template="deepseek", ) @@ -298,6 +429,9 @@ register_model_group( DownloadSource.DEFAULT: "tiiuae/falcon-7b", DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-7b", }, + "Falcon-11B": { + DownloadSource.DEFAULT: "tiiuae/falcon-11B", + }, "Falcon-40B": { DownloadSource.DEFAULT: "tiiuae/falcon-40b", DownloadSource.MODELSCOPE: "AI-ModelScope/falcon-40b", @@ -319,7 +453,6 @@ register_model_group( DownloadSource.MODELSCOPE: "modelscope/falcon-180B-chat", }, }, - module="query_key_value", template="falcon", ) @@ -342,11 +475,36 @@ register_model_group( DownloadSource.DEFAULT: "google/gemma-7b-it", DownloadSource.MODELSCOPE: "AI-ModelScope/gemma-7b-it", }, + "Gemma-1.1-2B-Chat": { + DownloadSource.DEFAULT: "google/gemma-1.1-2b-it", + }, + "Gemma-1.1-7B-Chat": { + DownloadSource.DEFAULT: "google/gemma-1.1-7b-it", + }, }, template="gemma", ) +register_model_group( + models={ + "GLM-4-9B": { + DownloadSource.DEFAULT: "THUDM/glm-4-9b", + DownloadSource.MODELSCOPE: "ZhipuAI/glm-4-9b", + }, + "GLM-4-9B-Chat": { + DownloadSource.DEFAULT: "THUDM/glm-4-9b-chat", + DownloadSource.MODELSCOPE: "ZhipuAI/glm-4-9b-chat", + }, + "GLM-4-9B-1M-Chat": { + DownloadSource.DEFAULT: "THUDM/glm-4-9b-chat-1m", + DownloadSource.MODELSCOPE: "ZhipuAI/glm-4-9b-chat-1m", + }, + }, + template="glm4", +) + + register_model_group( models={ "InternLM-7B": { @@ -389,11 +547,20 @@ register_model_group( DownloadSource.MODELSCOPE: "Shanghai_AI_Laboratory/internlm2-chat-20b", }, }, - module="wqkv", template="intern2", ) +register_model_group( + models={ + "Jambda-v0.1": { + DownloadSource.DEFAULT: "ai21labs/Jamba-v0.1", + DownloadSource.MODELSCOPE: "AI-ModelScope/Jamba-v0.1", + } + }, +) + + register_model_group( models={ "LingoWhale-8B": { @@ -401,7 +568,6 @@ register_model_group( DownloadSource.MODELSCOPE: "DeepLang/LingoWhale-8B", } }, - module="qkv_proj", ) @@ -460,18 +626,72 @@ register_model_group( register_model_group( models={ - "Mistral-7B": { + "LLaMA3-8B": { + DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-8B", + DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-8B", + }, + "LLaMA3-70B": { + DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-70B", + DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-70B", + }, + "LLaMA3-8B-Chat": { + DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-8B-Instruct", + DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-8B-Instruct", + }, + "LLaMA3-70B-Chat": { + 
DownloadSource.DEFAULT: "meta-llama/Meta-Llama-3-70B-Instruct", + DownloadSource.MODELSCOPE: "LLM-Research/Meta-Llama-3-70B-Instruct", + }, + "LLaMA3-8B-Chinese-Chat": { + DownloadSource.DEFAULT: "shenzhi-wang/Llama3-8B-Chinese-Chat", + DownloadSource.MODELSCOPE: "LLM-Research/Llama3-8B-Chinese-Chat", + }, + "LLaMA3-70B-Chinese-Chat": { + DownloadSource.DEFAULT: "shenzhi-wang/Llama3-70B-Chinese-Chat", + }, + }, + template="llama3", +) + + +register_model_group( + models={ + "LLaVA1.5-7B-Chat": { + DownloadSource.DEFAULT: "llava-hf/llava-1.5-7b-hf", + }, + "LLaVA1.5-13B-Chat": { + DownloadSource.DEFAULT: "llava-hf/llava-1.5-13b-hf", + }, + }, + template="vicuna", + vision=True, +) + + +register_model_group( + models={ + "Mistral-7B-v0.1": { DownloadSource.DEFAULT: "mistralai/Mistral-7B-v0.1", DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-v0.1", }, - "Mistral-7B-Chat": { + "Mistral-7B-v0.1-Chat": { DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.1", DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.1", }, + "Mistral-7B-v0.2": { + DownloadSource.DEFAULT: "alpindale/Mistral-7B-v0.2-hf", + DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-v0.2-hf", + }, "Mistral-7B-v0.2-Chat": { DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.2", DownloadSource.MODELSCOPE: "AI-ModelScope/Mistral-7B-Instruct-v0.2", }, + "Mistral-7B-v0.3": { + DownloadSource.DEFAULT: "mistralai/Mistral-7B-v0.3", + }, + "Mistral-7B-v0.3-Chat": { + DownloadSource.DEFAULT: "mistralai/Mistral-7B-Instruct-v0.3", + }, }, template="mistral", ) @@ -479,14 +699,22 @@ register_model_group( register_model_group( models={ - "Mixtral-8x7B": { + "Mixtral-8x7B-v0.1": { DownloadSource.DEFAULT: "mistralai/Mixtral-8x7B-v0.1", DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x7B-v0.1", }, - "Mixtral-8x7B-Chat": { + "Mixtral-8x7B-v0.1-Chat": { DownloadSource.DEFAULT: "mistralai/Mixtral-8x7B-Instruct-v0.1", DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x7B-Instruct-v0.1", }, + "Mixtral-8x22B-v0.1": { + DownloadSource.DEFAULT: "mistralai/Mixtral-8x22B-v0.1", + DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x22B-v0.1", + }, + "Mixtral-8x22B-v0.1-Chat": { + DownloadSource.DEFAULT: "mistralai/Mixtral-8x22B-Instruct-v0.1", + DownloadSource.MODELSCOPE: "AI-ModelScope/Mixtral-8x22B-Instruct-v0.1", + }, }, template="mistral", ) @@ -495,18 +723,18 @@ register_model_group( register_model_group( models={ "OLMo-1B": { - DownloadSource.DEFAULT: "allenai/OLMo-1B", + DownloadSource.DEFAULT: "allenai/OLMo-1B-hf", }, "OLMo-7B": { - DownloadSource.DEFAULT: "allenai/OLMo-7B", - DownloadSource.MODELSCOPE: "AI-ModelScope/OLMo-7B", + DownloadSource.DEFAULT: "allenai/OLMo-7B-hf", }, "OLMo-7B-Chat": { - DownloadSource.DEFAULT: "allenai/OLMo-7B-Instruct", + DownloadSource.DEFAULT: "ssec-uw/OLMo-7B-Instruct-hf", + }, + "OLMo-1.7-7B": { + DownloadSource.DEFAULT: "allenai/OLMo-1.7-7B-hf", }, }, - module="att_proj", - template="olmo", ) @@ -514,13 +742,23 @@ register_model_group( models={ "OpenChat3.5-7B-Chat": { DownloadSource.DEFAULT: "openchat/openchat-3.5-0106", - DownloadSource.MODELSCOPE: "myxiongmodel/openchat_3.5", + DownloadSource.MODELSCOPE: "xcwzxcwz/openchat-3.5-0106", } }, template="openchat", ) +register_model_group( + models={ + "OpenChat3.6-8B-Chat": { + DownloadSource.DEFAULT: "openchat/openchat-3.6-8b-20240522", + } + }, + template="openchat-3.6", +) + + register_model_group( models={ "Orion-14B-Base": { @@ -548,6 +786,33 @@ register_model_group( ) +register_model_group( + models={ + "PaliGemma-3B-pt-224": 
{ + DownloadSource.DEFAULT: "google/paligemma-3b-pt-224", + DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-pt-224", + }, + "PaliGemma-3B-pt-448": { + DownloadSource.DEFAULT: "google/paligemma-3b-pt-448", + DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-pt-448", + }, + "PaliGemma-3B-pt-896": { + DownloadSource.DEFAULT: "google/paligemma-3b-pt-896", + DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-pt-896", + }, + "PaliGemma-3B-mix-224": { + DownloadSource.DEFAULT: "google/paligemma-3b-mix-224", + DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-mix-224", + }, + "PaliGemma-3B-mix-448": { + DownloadSource.DEFAULT: "google/paligemma-3b-mix-448", + DownloadSource.MODELSCOPE: "AI-ModelScope/paligemma-3b-mix-448", + }, + }, + vision=True, +) + + register_model_group( models={ "Phi-1.5-1.3B": { @@ -562,6 +827,37 @@ register_model_group( ) +register_model_group( + models={ + "Phi3-4B-4k-Chat": { + DownloadSource.DEFAULT: "microsoft/Phi-3-mini-4k-instruct", + DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-mini-4k-instruct", + }, + "Phi3-4B-128k-Chat": { + DownloadSource.DEFAULT: "microsoft/Phi-3-mini-128k-instruct", + DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-mini-128k-instruct", + }, + "Phi3-7B-8k-Chat": { + DownloadSource.DEFAULT: "microsoft/Phi-3-small-8k-instruct", + DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-small-8k-instruct", + }, + "Phi3-7B-128k-Chat": { + DownloadSource.DEFAULT: "microsoft/Phi-3-small-128k-instruct", + DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-small-128k-instruct", + }, + "Phi3-14B-8k-Chat": { + DownloadSource.DEFAULT: "microsoft/Phi-3-medium-4k-instruct", + DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-medium-4k-instruct", + }, + "Phi3-14B-128k-Chat": { + DownloadSource.DEFAULT: "microsoft/Phi-3-medium-128k-instruct", + DownloadSource.MODELSCOPE: "LLM-Research/Phi-3-medium-128k-instruct", + }, + }, + template="phi", +) + + register_model_group( models={ "Qwen-1.8B": { @@ -629,7 +925,6 @@ register_model_group( DownloadSource.MODELSCOPE: "qwen/Qwen-72B-Chat-Int4", }, }, - module="c_attn", template="qwen", ) @@ -656,10 +951,26 @@ register_model_group( DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B", }, + "Qwen1.5-32B": { + DownloadSource.DEFAULT: "Qwen/Qwen1.5-32B", + DownloadSource.MODELSCOPE: "qwen/Qwen1.5-32B", + }, "Qwen1.5-72B": { DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B", }, + "Qwen1.5-110B": { + DownloadSource.DEFAULT: "Qwen/Qwen1.5-110B", + DownloadSource.MODELSCOPE: "qwen/Qwen1.5-110B", + }, + "Qwen1.5-MoE-A2.7B": { + DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B", + DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B", + }, + "Qwen1.5-Code-7B": { + DownloadSource.DEFAULT: "Qwen/CodeQwen1.5-7B", + DownloadSource.MODELSCOPE: "qwen/CodeQwen1.5-7B", + }, "Qwen1.5-0.5B-Chat": { DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B-Chat", @@ -680,10 +991,26 @@ register_model_group( DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B-Chat", }, + "Qwen1.5-32B-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen1.5-32B-Chat", + DownloadSource.MODELSCOPE: "qwen/Qwen1.5-32B-Chat", + }, "Qwen1.5-72B-Chat": { DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat", }, + "Qwen1.5-110B-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen1.5-110B-Chat", + DownloadSource.MODELSCOPE: "qwen/Qwen1.5-110B-Chat", + }, + 
"Qwen1.5-MoE-A2.7B-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B-Chat", + DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B-Chat", + }, + "Qwen1.5-Code-7B-Chat": { + DownloadSource.DEFAULT: "Qwen/CodeQwen1.5-7B-Chat", + DownloadSource.MODELSCOPE: "qwen/CodeQwen1.5-7B-Chat", + }, "Qwen1.5-0.5B-int8-Chat": { DownloadSource.DEFAULT: "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8", @@ -724,6 +1051,10 @@ register_model_group( DownloadSource.DEFAULT: "Qwen/Qwen1.5-14B-Chat-AWQ", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-14B-Chat-AWQ", }, + "Qwen1.5-32B-int4-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen1.5-32B-Chat-AWQ", + DownloadSource.MODELSCOPE: "qwen/Qwen1.5-32B-Chat-AWQ", + }, "Qwen1.5-72B-int8-Chat": { DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat-GPTQ-Int8", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat-GPTQ-Int8", @@ -732,6 +1063,101 @@ register_model_group( DownloadSource.DEFAULT: "Qwen/Qwen1.5-72B-Chat-AWQ", DownloadSource.MODELSCOPE: "qwen/Qwen1.5-72B-Chat-AWQ", }, + "Qwen1.5-110B-int4-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen1.5-110B-Chat-AWQ", + DownloadSource.MODELSCOPE: "qwen/Qwen1.5-110B-Chat-AWQ", + }, + "Qwen1.5-MoE-A2.7B-int4-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4", + DownloadSource.MODELSCOPE: "qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4", + }, + "Qwen1.5-Code-7B-int4-Chat": { + DownloadSource.DEFAULT: "Qwen/CodeQwen1.5-7B-Chat-AWQ", + DownloadSource.MODELSCOPE: "qwen/CodeQwen1.5-7B-Chat-AWQ", + }, + }, + template="qwen", +) + + +register_model_group( + models={ + "Qwen2-0.5B": { + DownloadSource.DEFAULT: "Qwen/Qwen2-0.5B", + DownloadSource.MODELSCOPE: "qwen/Qwen2-0.5B", + }, + "Qwen2-1.5B": { + DownloadSource.DEFAULT: "Qwen/Qwen2-1.5B", + DownloadSource.MODELSCOPE: "qwen/Qwen2-1.5B", + }, + "Qwen2-7B": { + DownloadSource.DEFAULT: "Qwen/Qwen2-7B", + DownloadSource.MODELSCOPE: "qwen/Qwen2-7B", + }, + "Qwen2-72B": { + DownloadSource.DEFAULT: "Qwen/Qwen2-72B", + DownloadSource.MODELSCOPE: "qwen/Qwen2-72B", + }, + "Qwen2-MoE-57B": { + DownloadSource.DEFAULT: "Qwen/Qwen2-57B-A14B", + DownloadSource.MODELSCOPE: "qwen/Qwen2-57B-A14B", + }, + "Qwen2-0.5B-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen2-0.5B-Instruct", + DownloadSource.MODELSCOPE: "qwen/Qwen2-0.5B-Instruct", + }, + "Qwen2-1.5B-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen2-1.5B-Instruct", + DownloadSource.MODELSCOPE: "qwen/Qwen2-1.5B-Instruct", + }, + "Qwen2-7B-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen2-7B-Instruct", + DownloadSource.MODELSCOPE: "qwen/Qwen2-7B-Instruct", + }, + "Qwen2-72B-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen2-72B-Instruct", + DownloadSource.MODELSCOPE: "qwen/Qwen2-72B-Instruct", + }, + "Qwen2-MoE-57B-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen2-57B-A14B-Instruct", + DownloadSource.MODELSCOPE: "qwen/Qwen2-57B-A14B-Instruct", + }, + "Qwen2-0.5B-int8-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen2-0.5B-Instruct-GPTQ-Int8", + DownloadSource.MODELSCOPE: "qwen/Qwen2-0.5B-Instruct-GPTQ-Int8", + }, + "Qwen2-0.5B-int4-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen2-0.5B-Instruct-AWQ", + DownloadSource.MODELSCOPE: "qwen/Qwen2-0.5B-Instruct-AWQ", + }, + "Qwen2-1.5B-int8-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int8", + DownloadSource.MODELSCOPE: "qwen/Qwen2-1.5B-Instruct-GPTQ-Int8", + }, + "Qwen2-1.5B-int4-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen2-1.5B-Instruct-AWQ", + DownloadSource.MODELSCOPE: "qwen/Qwen2-1.5B-Instruct-AWQ", + }, + "Qwen2-7B-int8-Chat": { + 
DownloadSource.DEFAULT: "Qwen/Qwen2-7B-Instruct-GPTQ-Int8", + DownloadSource.MODELSCOPE: "qwen/Qwen2-7B-Instruct-GPTQ-Int8", + }, + "Qwen2-7B-int4-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen2-7B-Instruct-AWQ", + DownloadSource.MODELSCOPE: "qwen/Qwen2-7B-Instruct-AWQ", + }, + "Qwen2-72B-int8-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen2-72B-Instruct-GPTQ-Int8", + DownloadSource.MODELSCOPE: "qwen/Qwen2-72B-Instruct-GPTQ-Int8", + }, + "Qwen2-72B-int4-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen2-72B-Instruct-AWQ", + DownloadSource.MODELSCOPE: "qwen/Qwen2-72B-Instruct-AWQ", + }, + "Qwen2-MoE-57B-int4-Chat": { + DownloadSource.DEFAULT: "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4", + DownloadSource.MODELSCOPE: "qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4", + }, }, template="qwen", ) @@ -765,17 +1191,39 @@ register_model_group( models={ "StarCoder2-3B": { DownloadSource.DEFAULT: "bigcode/starcoder2-3b", + DownloadSource.MODELSCOPE: "AI-ModelScope/starcoder2-3b", }, "StarCoder2-7B": { DownloadSource.DEFAULT: "bigcode/starcoder2-7b", + DownloadSource.MODELSCOPE: "AI-ModelScope/starcoder2-7b", }, "StarCoder2-15B": { DownloadSource.DEFAULT: "bigcode/starcoder2-15b", + DownloadSource.MODELSCOPE: "AI-ModelScope/starcoder2-15b", }, } ) +register_model_group( + models={ + "TeleChat-7B-Chat": { + DownloadSource.DEFAULT: "Tele-AI/telechat-7B", + DownloadSource.MODELSCOPE: "TeleAI/telechat-7B", + }, + "TeleChat-12B-Chat": { + DownloadSource.DEFAULT: "Tele-AI/TeleChat-12B", + DownloadSource.MODELSCOPE: "TeleAI/TeleChat-12B", + }, + "TeleChat-12B-v2-Chat": { + DownloadSource.DEFAULT: "Tele-AI/TeleChat-12B-v2", + DownloadSource.MODELSCOPE: "TeleAI/TeleChat-12B-v2", + }, + }, + template="telechat", +) + + register_model_group( models={ "Vicuna1.5-7B-Chat": { @@ -793,17 +1241,53 @@ register_model_group( register_model_group( models={ + "XuanYuan-6B": { + DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B", + DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B", + }, "XuanYuan-70B": { DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B", + DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B", + }, + "XuanYuan-2-70B": { + DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B", + DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B", + }, + "XuanYuan-6B-Chat": { + DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B-Chat", + DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B-Chat", }, "XuanYuan-70B-Chat": { DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat", + DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B-Chat", + }, + "XuanYuan-2-70B-Chat": { + DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B-Chat", + DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B-Chat", + }, + "XuanYuan-6B-int8-Chat": { + DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B-Chat-8bit", + DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B-Chat-8bit", + }, + "XuanYuan-6B-int4-Chat": { + DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-6B-Chat-4bit", + DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-6B-Chat-4bit", }, "XuanYuan-70B-int8-Chat": { DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat-8bit", + DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B-Chat-8bit", }, "XuanYuan-70B-int4-Chat": { DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan-70B-Chat-4bit", + DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan-70B-Chat-4bit", + }, + "XuanYuan-2-70B-int8-Chat": { + DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B-Chat-8bit", + DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B-Chat-8bit", + }, + 
"XuanYuan-2-70B-int4-Chat": { + DownloadSource.DEFAULT: "Duxiaoman-DI/XuanYuan2-70B-Chat-4bit", + DownloadSource.MODELSCOPE: "Duxiaoman-DI/XuanYuan2-70B-Chat-4bit", }, }, template="xuanyuan", @@ -840,6 +1324,30 @@ register_model_group( DownloadSource.DEFAULT: "xverse/XVERSE-65B-Chat", DownloadSource.MODELSCOPE: "xverse/XVERSE-65B-Chat", }, + "XVERSE-MoE-A4.2B": { + DownloadSource.DEFAULT: "xverse/XVERSE-MoE-A4.2B", + DownloadSource.MODELSCOPE: "xverse/XVERSE-MoE-A4.2B", + }, + "XVERSE-7B-int8-Chat": { + DownloadSource.DEFAULT: "xverse/XVERSE-7B-Chat-GPTQ-Int8", + DownloadSource.MODELSCOPE: "xverse/XVERSE-7B-Chat-GPTQ-Int8", + }, + "XVERSE-7B-int4-Chat": { + DownloadSource.DEFAULT: "xverse/XVERSE-7B-Chat-GPTQ-Int4", + DownloadSource.MODELSCOPE: "xverse/XVERSE-7B-Chat-GPTQ-Int4", + }, + "XVERSE-13B-int8-Chat": { + DownloadSource.DEFAULT: "xverse/XVERSE-13B-Chat-GPTQ-Int8", + DownloadSource.MODELSCOPE: "xverse/XVERSE-13B-Chat-GPTQ-Int8", + }, + "XVERSE-13B-int4-Chat": { + DownloadSource.DEFAULT: "xverse/XVERSE-13B-Chat-GPTQ-Int4", + DownloadSource.MODELSCOPE: "xverse/XVERSE-13B-Chat-GPTQ-Int4", + }, + "XVERSE-65B-int4-Chat": { + DownloadSource.DEFAULT: "xverse/XVERSE-65B-Chat-GPTQ-Int4", + DownloadSource.MODELSCOPE: "xverse/XVERSE-65B-Chat-GPTQ-Int4", + }, }, template="xverse", ) @@ -898,11 +1406,49 @@ register_model_group( DownloadSource.DEFAULT: "01-ai/Yi-34B-Chat-4bits", DownloadSource.MODELSCOPE: "01ai/Yi-34B-Chat-4bits", }, + "Yi-1.5-6B": { + DownloadSource.DEFAULT: "01-ai/Yi-1.5-6B", + DownloadSource.MODELSCOPE: "01ai/Yi-1.5-6B", + }, + "Yi-1.5-9B": { + DownloadSource.DEFAULT: "01-ai/Yi-1.5-9B", + DownloadSource.MODELSCOPE: "01ai/Yi-1.5-9B", + }, + "Yi-1.5-34B": { + DownloadSource.DEFAULT: "01-ai/Yi-1.5-34B", + DownloadSource.MODELSCOPE: "01ai/Yi-1.5-34B", + }, + "Yi-1.5-6B-Chat": { + DownloadSource.DEFAULT: "01-ai/Yi-1.5-6B-Chat", + DownloadSource.MODELSCOPE: "01ai/Yi-1.5-6B-Chat", + }, + "Yi-1.5-9B-Chat": { + DownloadSource.DEFAULT: "01-ai/Yi-1.5-9B-Chat", + DownloadSource.MODELSCOPE: "01ai/Yi-1.5-9B-Chat", + }, + "Yi-1.5-34B-Chat": { + DownloadSource.DEFAULT: "01-ai/Yi-1.5-34B-Chat", + DownloadSource.MODELSCOPE: "01ai/Yi-1.5-34B-Chat", + }, }, template="yi", ) +register_model_group( + models={ + "YiVL-6B-Chat": { + DownloadSource.DEFAULT: "BUAADreamer/Yi-VL-6B-hf", + }, + "YiVL-34B-Chat": { + DownloadSource.DEFAULT: "BUAADreamer/Yi-VL-34B-hf", + }, + }, + template="yi_vl", + vision=True, +) + + register_model_group( models={ "Yuan2-2B-Chat": { @@ -932,21 +1478,9 @@ register_model_group( DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-7b-beta", DownloadSource.MODELSCOPE: "modelscope/zephyr-7b-beta", }, + "Zephyr-141B-ORPO-Chat": { + DownloadSource.DEFAULT: "HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1", + }, }, template="zephyr", ) - - -register_model_group( - models={ - "Atom-7B": { - DownloadSource.DEFAULT: "FlagAlpha/Atom-7B", - DownloadSource.MODELSCOPE: "FlagAlpha/Atom-7B", - }, - "Atom-7B-Chat": { - DownloadSource.DEFAULT: "FlagAlpha/Atom-7B-Chat", - DownloadSource.MODELSCOPE: "FlagAlpha/Atom-7B-Chat", - }, - }, - template="atom", -) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/env.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/env.py new file mode 100644 index 0000000..cd81442 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/env.py @@ -0,0 +1,55 @@ +import platform + +import accelerate +import datasets +import peft +import torch +import transformers +import trl +from transformers.integrations import 
is_deepspeed_available +from transformers.utils import is_bitsandbytes_available, is_torch_cuda_available, is_torch_npu_available + +from .packages import is_vllm_available + + +VERSION = "0.8.0" + + +def print_env() -> None: + info = { + "`llamafactory` version": VERSION, + "Platform": platform.platform(), + "Python version": platform.python_version(), + "PyTorch version": torch.__version__, + "Transformers version": transformers.__version__, + "Datasets version": datasets.__version__, + "Accelerate version": accelerate.__version__, + "PEFT version": peft.__version__, + "TRL version": trl.__version__, + } + + if is_torch_cuda_available(): + info["PyTorch version"] += " (GPU)" + info["GPU type"] = torch.cuda.get_device_name() + + if is_torch_npu_available(): + info["PyTorch version"] += " (NPU)" + info["NPU type"] = torch.npu.get_device_name() + info["CANN version"] = torch.version.cann + + if is_deepspeed_available(): + import deepspeed # type: ignore + + info["DeepSpeed version"] = deepspeed.__version__ + + if is_bitsandbytes_available(): + import bitsandbytes + + info["Bitsandbytes version"] = bitsandbytes.__version__ + + if is_vllm_available(): + import vllm + + info["vLLM version"] = vllm.__version__ + + print("\n" + "\n".join(["- {}: {}".format(key, value) for key, value in info.items()]) + "\n") diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/logging.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/logging.py new file mode 100644 index 0000000..430b8a4 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/logging.py @@ -0,0 +1,68 @@ +import logging +import os +import sys +from concurrent.futures import ThreadPoolExecutor + +from .constants import RUNNING_LOG + + +class LoggerHandler(logging.Handler): + r""" + Logger handler used in Web UI. + """ + + def __init__(self, output_dir: str) -> None: + super().__init__() + formatter = logging.Formatter( + fmt="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S" + ) + self.setLevel(logging.INFO) + self.setFormatter(formatter) + + os.makedirs(output_dir, exist_ok=True) + self.running_log = os.path.join(output_dir, RUNNING_LOG) + if os.path.exists(self.running_log): + os.remove(self.running_log) + + self.thread_pool = ThreadPoolExecutor(max_workers=1) + + def _write_log(self, log_entry: str) -> None: + with open(self.running_log, "a", encoding="utf-8") as f: + f.write(log_entry + "\n\n") + + def emit(self, record) -> None: + if record.name == "httpx": + return + + log_entry = self.format(record) + self.thread_pool.submit(self._write_log, log_entry) + + def close(self) -> None: + self.thread_pool.shutdown(wait=True) + return super().close() + + +def get_logger(name: str) -> logging.Logger: + r""" + Gets a standard logger with a stream hander to stdout. + """ + formatter = logging.Formatter( + fmt="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S" + ) + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter(formatter) + + logger = logging.getLogger(name) + logger.setLevel(logging.INFO) + logger.addHandler(handler) + + return logger + + +def reset_logging() -> None: + r""" + Removes basic config of root logger. 
(unused in script) + """ + root = logging.getLogger() + list(map(root.removeHandler, root.handlers)) + list(map(root.removeFilter, root.filters)) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/misc.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/misc.py similarity index 75% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/misc.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/misc.py index 21d4b4c..fc33f77 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/misc.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/misc.py @@ -30,7 +30,7 @@ except Exception: if TYPE_CHECKING: from trl import AutoModelForCausalLMWithValueHead - from llmtuner.hparams import ModelArguments + from ..hparams import ModelArguments logger = get_logger(__name__) @@ -58,14 +58,14 @@ class AverageMeter: def check_dependencies() -> None: - if int(os.environ.get("DISABLE_VERSION_CHECK", "0")): + if os.environ.get("DISABLE_VERSION_CHECK", "0").lower() in ["true", "1"]: logger.warning("Version checking has been disabled, may lead to unexpected behaviors.") else: - require_version("transformers>=4.37.2", "To fix: pip install transformers>=4.37.2") - require_version("datasets>=2.14.3", "To fix: pip install datasets>=2.14.3") - require_version("accelerate>=0.27.2", "To fix: pip install accelerate>=0.27.2") - require_version("peft>=0.9.0", "To fix: pip install peft>=0.9.0") - require_version("trl>=0.7.11", "To fix: pip install trl>=0.7.11") + require_version("transformers>=4.41.2", "To fix: pip install transformers>=4.41.2") + require_version("datasets>=2.16.0", "To fix: pip install datasets>=2.16.0") + require_version("accelerate>=0.30.1", "To fix: pip install accelerate>=0.30.1") + require_version("peft>=0.11.1", "To fix: pip install peft>=0.11.1") + require_version("trl>=0.8.6", "To fix: pip install trl>=0.8.6") def count_parameters(model: torch.nn.Module) -> Tuple[int, int]: @@ -81,7 +81,14 @@ def count_parameters(model: torch.nn.Module) -> Tuple[int, int]: # Due to the design of 4bit linear layers from bitsandbytes, multiply the number of parameters by 2 if param.__class__.__name__ == "Params4bit": - num_params = num_params * 2 + if hasattr(param, "quant_storage") and hasattr(param.quant_storage, "itemsize"): + num_bytes = param.quant_storage.itemsize + elif hasattr(param, "element_size"): # for older pytorch version + num_bytes = param.element_size() + else: + num_bytes = 1 + + num_params = num_params * 2 * num_bytes all_param += num_params if param.requires_grad: @@ -158,13 +165,15 @@ def get_current_device() -> torch.device: def get_device_count() -> int: r""" - Gets the number of available GPU devices. + Gets the number of available GPU or NPU devices. """ - if not torch.cuda.is_available(): + if is_torch_npu_available(): + return torch.npu.device_count() + elif is_torch_cuda_available(): + return torch.cuda.device_count() + else: return 0 - return torch.cuda.device_count() - def get_logits_processor() -> "LogitsProcessorList": r""" @@ -187,30 +196,47 @@ def infer_optim_dtype(model_dtype: torch.dtype) -> torch.dtype: return torch.float32 +def is_gpu_or_npu_available() -> bool: + r""" + Checks if the GPU or NPU is available. + """ + return is_torch_npu_available() or is_torch_cuda_available() + + +def has_tokenized_data(path: os.PathLike) -> bool: + r""" + Checks if the path has a tokenized dataset. + """ + return os.path.isdir(path) and len(os.listdir(path)) > 0 + + def torch_gc() -> None: r""" - Collects GPU memory. 
+ Collects GPU or NPU memory. """ gc.collect() - if torch.cuda.is_available(): + if is_torch_xpu_available(): + torch.xpu.empty_cache() + elif is_torch_npu_available(): + torch.npu.empty_cache() + elif is_torch_mps_available(): + torch.mps.empty_cache() + elif is_torch_cuda_available(): torch.cuda.empty_cache() - torch.cuda.ipc_collect() -def try_download_model_from_ms(model_args: "ModelArguments") -> None: +def try_download_model_from_ms(model_args: "ModelArguments") -> str: if not use_modelscope() or os.path.exists(model_args.model_name_or_path): - return + return model_args.model_name_or_path try: from modelscope import snapshot_download revision = "master" if model_args.model_revision == "main" else model_args.model_revision - model_args.model_name_or_path = snapshot_download( - model_args.model_name_or_path, revision=revision, cache_dir=model_args.cache_dir - ) + return snapshot_download(model_args.model_name_or_path, revision=revision, cache_dir=model_args.cache_dir) except ImportError: raise ImportError("Please install modelscope via `pip install modelscope -U`") def use_modelscope() -> bool: - return bool(int(os.environ.get("USE_MODELSCOPE_HUB", "0"))) + return os.environ.get("USE_MODELSCOPE_HUB", "0").lower() in ["true", "1"] diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/packages.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/packages.py similarity index 63% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/packages.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/packages.py index cf10ffd..4c9e649 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/packages.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/packages.py @@ -1,30 +1,41 @@ import importlib.metadata import importlib.util +from typing import TYPE_CHECKING + +from packaging import version + + +if TYPE_CHECKING: + from packaging.version import Version def _is_package_available(name: str) -> bool: return importlib.util.find_spec(name) is not None -def _get_package_version(name: str) -> str: +def _get_package_version(name: str) -> "Version": try: - return importlib.metadata.version(name) + return version.parse(importlib.metadata.version(name)) except Exception: - return "0.0.0" + return version.parse("0.0.0") -def is_fastapi_availble(): +def is_fastapi_available(): return _is_package_available("fastapi") def is_flash_attn2_available(): - return _is_package_available("flash_attn") and _get_package_version("flash_attn").startswith("2") + return _is_package_available("flash_attn") and _get_package_version("flash_attn") > version.parse("2.0.0") def is_galore_available(): return _is_package_available("galore_torch") +def is_gradio_available(): + return _is_package_available("gradio") + + def is_jieba_available(): return _is_package_available("jieba") @@ -37,6 +48,10 @@ def is_nltk_available(): return _is_package_available("nltk") +def is_pillow_available(): + return _is_package_available("PIL") + + def is_requests_available(): return _is_package_available("requests") @@ -45,14 +60,14 @@ def is_rouge_available(): return _is_package_available("rouge_chinese") +def is_sdpa_available(): + return _get_package_version("torch") > version.parse("2.1.1") + + def is_starlette_available(): return _is_package_available("sse_starlette") -def is_unsloth_available(): - return _is_package_available("unsloth") - - def is_uvicorn_available(): return _is_package_available("uvicorn") diff --git 
a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/ploting.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/ploting.py similarity index 65% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/ploting.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/ploting.py index aa101cb..dea23bb 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/ploting.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/extras/ploting.py @@ -1,7 +1,7 @@ import json import math import os -from typing import List +from typing import Any, Dict, List from transformers.trainer import TRAINER_STATE_NAME @@ -10,6 +10,7 @@ from .packages import is_matplotlib_available if is_matplotlib_available(): + import matplotlib.figure import matplotlib.pyplot as plt @@ -20,8 +21,11 @@ def smooth(scalars: List[float]) -> List[float]: r""" EMA implementation according to TensorBoard. """ + if len(scalars) == 0: + return [] + last = scalars[0] - smoothed = list() + smoothed = [] weight = 1.8 * (1 / (1 + math.exp(-0.05 * len(scalars))) - 0.5) # a sigmoid function for next_val in scalars: smoothed_val = last * weight + (1 - weight) * next_val @@ -30,7 +34,33 @@ def smooth(scalars: List[float]) -> List[float]: return smoothed +def gen_loss_plot(trainer_log: List[Dict[str, Any]]) -> "matplotlib.figure.Figure": + r""" + Plots loss curves in LlamaBoard. + """ + plt.close("all") + plt.switch_backend("agg") + fig = plt.figure() + ax = fig.add_subplot(111) + steps, losses = [], [] + for log in trainer_log: + if log.get("loss", None): + steps.append(log["current_steps"]) + losses.append(log["loss"]) + + ax.plot(steps, losses, color="#1f77b4", alpha=0.4, label="original") + ax.plot(steps, smooth(losses), color="#1f77b4", label="smoothed") + ax.legend() + ax.set_xlabel("step") + ax.set_ylabel("loss") + return fig + + def plot_loss(save_dictionary: os.PathLike, keys: List[str] = ["loss"]) -> None: + r""" + Plots loss curves and saves the image. 
+ """ + plt.switch_backend("agg") with open(os.path.join(save_dictionary, TRAINER_STATE_NAME), "r", encoding="utf-8") as f: data = json.load(f) @@ -52,6 +82,6 @@ def plot_loss(save_dictionary: os.PathLike, keys: List[str] = ["loss"]) -> None: plt.xlabel("step") plt.ylabel(key) plt.legend() - figure_path = os.path.join(save_dictionary, "training_{}.png".format(key.replace(os.path.sep, "_"))) + figure_path = os.path.join(save_dictionary, "training_{}.png".format(key.replace("/", "_"))) plt.savefig(figure_path, format="png", dpi=100) print("Figure saved at:", figure_path) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/__init__.py similarity index 100% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/__init__.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/__init__.py diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/data_args.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/data_args.py similarity index 93% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/data_args.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/data_args.py index 76e6d6d..1e0cd08 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/data_args.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/data_args.py @@ -26,11 +26,11 @@ class DataArguments: ) cutoff_len: int = field( default=1024, - metadata={"help": "The cutoff length of the model inputs after tokenization."}, + metadata={"help": "The cutoff length of the tokenized inputs in the dataset."}, ) reserved_label_len: int = field( default=1, - metadata={"help": "The minimum cutoff length reserved for label after tokenization."}, + metadata={"help": "The minimum cutoff length reserved for the tokenized labels in the dataset."}, ) train_on_prompt: bool = field( default=False, @@ -84,9 +84,9 @@ class DataArguments: "help": "Whether or not to pack the sequences in training. Will automatically enable in pre-training." }, ) - cache_path: Optional[str] = field( + tokenized_path: Optional[str] = field( default=None, - metadata={"help": "Path to save or load the pre-processed datasets."}, + metadata={"help": "Path to save or load the tokenized datasets."}, ) def __post_init__(self): diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/evaluation_args.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/evaluation_args.py similarity index 100% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/evaluation_args.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/evaluation_args.py diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/finetuning_args.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/finetuning_args.py similarity index 50% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/finetuning_args.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/finetuning_args.py index be1fd12..08af31e 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/finetuning_args.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/finetuning_args.py @@ -1,5 +1,4 @@ -import json -from dataclasses import asdict, dataclass, field +from dataclasses import dataclass, field from typing import Literal, Optional @@ -9,22 +8,35 @@ class FreezeArguments: Arguments pertaining to the freeze (partial-parameter) training. 
""" - name_module_trainable: str = field( - default="all", + freeze_trainable_layers: int = field( + default=2, metadata={ - "help": """Name of trainable modules for partial-parameter (freeze) fine-tuning. \ - Use commas to separate multiple modules. \ - Use "all" to specify all the available modules. \ - LLaMA choices: ["mlp", "self_attn"], \ - BLOOM & Falcon & ChatGLM choices: ["mlp", "self_attention"], \ - Qwen choices: ["mlp", "attn"], \ - InternLM2 choices: ["feed_forward", "attention"], \ - Others choices: the same as LLaMA.""" + "help": ( + "The number of trainable layers for freeze (partial-parameter) fine-tuning. " + "Positive numbers mean the last n layers are set as trainable, " + "negative numbers mean the first n layers are set as trainable." + ) }, ) - num_layer_trainable: int = field( - default=2, - metadata={"help": "The number of trainable layers for partial-parameter (freeze) fine-tuning."}, + freeze_trainable_modules: str = field( + default="all", + metadata={ + "help": ( + "Name(s) of trainable modules for freeze (partial-parameter) fine-tuning. " + "Use commas to separate multiple modules. " + "Use `all` to specify all the available modules." + ) + }, + ) + freeze_extra_modules: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Name(s) of modules apart from hidden layers to be set as trainable " + "for freeze (partial-parameter) fine-tuning. " + "Use commas to separate multiple modules." + ) + }, ) @@ -37,7 +49,11 @@ class LoraArguments: additional_target: Optional[str] = field( default=None, metadata={ - "help": "Name(s) of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint." + "help": ( + "Name(s) of modules apart from LoRA layers to be set as trainable " + "and saved in the final checkpoint. " + "Use commas to separate multiple modules." + ) }, ) lora_alpha: Optional[int] = field( @@ -55,17 +71,21 @@ class LoraArguments: lora_target: str = field( default="all", metadata={ - "help": """Name(s) of target modules to apply LoRA. \ - Use commas to separate multiple modules. \ - Use "all" to specify all the available modules. \ - LLaMA choices: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], \ - BLOOM & Falcon & ChatGLM choices: ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"], \ - Baichuan choices: ["W_pack", "o_proj", "gate_proj", "up_proj", "down_proj"], \ - Qwen choices: ["c_attn", "attn.c_proj", "w1", "w2", "mlp.c_proj"], \ - InternLM2 choices: ["wqkv", "wo", "w1", "w2", "w3"], \ - Others choices: the same as LLaMA.""" + "help": ( + "Name(s) of target modules to apply LoRA. " + "Use commas to separate multiple modules. " + "Use `all` to specify all the linear modules." + ) }, ) + loraplus_lr_ratio: Optional[float] = field( + default=None, + metadata={"help": "LoRA plus learning rate ratio (lr_B / lr_A)."}, + ) + loraplus_lr_embedding: float = field( + default=1e-6, + metadata={"help": "LoRA plus learning rate for lora embedding layers."}, + ) use_rslora: bool = field( default=False, metadata={"help": "Whether or not to use the rank stabilization scaling factor for LoRA layer."}, @@ -83,20 +103,36 @@ class LoraArguments: @dataclass class RLHFArguments: r""" - Arguments pertaining to the PPO and DPO training. + Arguments pertaining to the PPO, DPO and KTO training. 
""" - dpo_beta: float = field( + pref_beta: float = field( default=0.1, - metadata={"help": "The beta parameter for the DPO loss."}, + metadata={"help": "The beta parameter in the preference loss."}, ) - dpo_loss: Literal["sigmoid", "hinge", "ipo", "kto_pair"] = field( + pref_ftx: float = field( + default=0.0, + metadata={"help": "The supervised fine-tuning loss coefficient in DPO training."}, + ) + pref_loss: Literal["sigmoid", "hinge", "ipo", "kto_pair", "orpo", "simpo"] = field( default="sigmoid", metadata={"help": "The type of DPO loss to use."}, ) - dpo_ftx: float = field( + dpo_label_smoothing: float = field( default=0.0, - metadata={"help": "The supervised fine-tuning loss coefficient in DPO training."}, + metadata={"help": "The robust DPO label smoothing parameter in cDPO that should be between 0 and 0.5."}, + ) + kto_chosen_weight: float = field( + default=1.0, + metadata={"help": "The weight factor of the desirable losses in KTO training."}, + ) + kto_rejected_weight: float = field( + default=1.0, + metadata={"help": "The weight factor of the undesirable losses in KTO training."}, + ) + simpo_gamma: float = field( + default=0.5, + metadata={"help": "The target reward margin term in SimPO loss."}, ) ppo_buffer_size: int = field( default=1, @@ -106,10 +142,6 @@ class RLHFArguments: default=4, metadata={"help": "The number of epochs to perform in a PPO optimization step."}, ) - ppo_logger: Optional[str] = field( - default=None, - metadata={"help": 'Log with either "wandb" or "tensorboard" in PPO training.'}, - ) ppo_score_norm: bool = field( default=False, metadata={"help": "Use score normalization in PPO training."}, @@ -160,11 +192,16 @@ class GaloreArguments: use_galore: bool = field( default=False, - metadata={"help": "Whether or not to use gradient low-Rank projection."}, + metadata={"help": "Whether or not to use the gradient low-Rank projection (GaLore)."}, ) galore_target: str = field( - default="mlp,attn", - metadata={"help": "Name(s) of modules to apply GaLore. Use commas to separate multiple modules."}, + default="all", + metadata={ + "help": ( + "Name(s) of modules to apply GaLore. Use commas to separate multiple modules. " + "Use `all` to specify all the linear modules." + ) + }, ) galore_rank: int = field( default=16, @@ -189,7 +226,60 @@ class GaloreArguments: @dataclass -class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreArguments): +class BAdamArgument: + r""" + Arguments pertaining to the BAdam optimizer. + """ + + use_badam: bool = field( + default=False, + metadata={"help": "Whether or not to use the BAdam optimizer."}, + ) + badam_mode: Literal["layer", "ratio"] = field( + default="layer", + metadata={"help": "Whether to use layer-wise or ratio-wise BAdam optimizer."}, + ) + badam_start_block: Optional[int] = field( + default=None, + metadata={"help": "The starting block index for layer-wise BAdam."}, + ) + badam_switch_mode: Optional[Literal["ascending", "descending", "random", "fixed"]] = field( + default="ascending", + metadata={"help": "the strategy of picking block to update for layer-wise BAdam."}, + ) + badam_switch_interval: Optional[int] = field( + default=50, + metadata={ + "help": "Number of steps to update the block for layer-wise BAdam. Use -1 to disable the block update." 
+ }, + ) + badam_update_ratio: float = field( + default=0.05, + metadata={"help": "The ratio of the update for ratio-wise BAdam."}, + ) + badam_mask_mode: Literal["adjacent", "scatter"] = field( + default="adjacent", + metadata={ + "help": ( + "The mode of the mask for BAdam optimizer. " + "`adjacent` means that the trainable parameters are adjacent to each other, " + "`scatter` means that trainable parameters are randomly chosen from the weights." + ) + }, + ) + badam_verbose: int = field( + default=0, + metadata={ + "help": ( + "The verbosity level of BAdam optimizer. " + "0 for no print, 1 for print the block prefix, 2 for print trainable parameters." + ) + }, + ) + + +@dataclass +class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreArguments, BAdamArgument): r""" Arguments pertaining to which techniques we are going to fine-tuning with. """ @@ -198,7 +288,7 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA default=False, metadata={"help": "Whether or not to train model in purely bf16 precision (without AMP)."}, ) - stage: Literal["pt", "sft", "rm", "ppo", "dpo"] = field( + stage: Literal["pt", "sft", "rm", "ppo", "dpo", "kto"] = field( default="sft", metadata={"help": "Which stage will be performed in training."}, ) @@ -210,6 +300,14 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA default=False, metadata={"help": "Whether or not to make only the parameters in the expanded blocks trainable."}, ) + freeze_vision_tower: bool = field( + default=True, + metadata={"help": "Whether or not to freeze vision tower in MLLM training."}, + ) + train_mm_proj_only: bool = field( + default=False, + metadata={"help": "Whether or not to train the multimodal projector for MLLM only."}, + ) plot_loss: bool = field( default=False, metadata={"help": "Whether or not to save the training loss curves."}, ) @@ -221,37 +319,40 @@ class FinetuningArguments(FreezeArguments, LoraArguments, RLHFArguments, GaloreA return [item.strip() for item in arg.split(",")] return arg - self.name_module_trainable = split_arg(self.name_module_trainable) + self.freeze_trainable_modules = split_arg(self.freeze_trainable_modules) + self.freeze_extra_modules = split_arg(self.freeze_extra_modules) self.lora_alpha = self.lora_alpha or self.lora_rank * 2 self.lora_target = split_arg(self.lora_target) self.additional_target = split_arg(self.additional_target) + self.galore_target = split_arg(self.galore_target) + self.freeze_vision_tower = self.freeze_vision_tower or self.train_mm_proj_only assert self.finetuning_type in ["lora", "freeze", "full"], "Invalid fine-tuning method." assert self.ref_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." assert self.reward_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization."
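
To make the new freeze_trainable_layers semantics concrete, here is a minimal sketch of the selection rule the help text above describes; the helper name is illustrative, and the actual implementation lives in _setup_freeze_tuning in adapter.py further down.

    from typing import List

    def select_trainable_layer_ids(num_layers: int, freeze_trainable_layers: int) -> List[int]:
        # Positive values mark the last n layers as trainable,
        # negative values mark the first n layers as trainable.
        if freeze_trainable_layers > 0:
            return list(range(max(0, num_layers - freeze_trainable_layers), num_layers))
        return list(range(min(-freeze_trainable_layers, num_layers)))

    print(select_trainable_layer_ids(32, 2))   # [30, 31]
    print(select_trainable_layer_ids(32, -2))  # [0, 1]
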
+ self.use_ref_model = self.pref_loss not in ["orpo", "simpo"] + if self.stage == "ppo" and self.reward_model is None: raise ValueError("`reward_model` is necessary for PPO training.") if self.stage == "ppo" and self.reward_model_type == "lora" and self.finetuning_type != "lora": raise ValueError("`reward_model_type` cannot be lora for Freeze/Full PPO training.") + if self.stage == "dpo" and self.pref_loss != "sigmoid" and self.dpo_label_smoothing > 1e-6: + raise ValueError("`dpo_label_smoothing` is only valid for sigmoid loss function.") + if self.use_llama_pro and self.finetuning_type == "full": - raise ValueError("`use_llama_pro` is only valid for the Freeze or LoRA method.") + raise ValueError("`use_llama_pro` is only valid for Freeze or LoRA training.") - if self.use_galore and self.finetuning_type == "lora": - raise ValueError("Cannot use LoRA with GaLore together.") + if self.finetuning_type == "lora" and (self.use_galore or self.use_badam): + raise ValueError("Cannot use LoRA with GaLore or BAdam together.") - def save_to_json(self, json_path: str): - r"""Saves the content of this instance in JSON format inside `json_path`.""" - json_string = json.dumps(asdict(self), indent=2, sort_keys=True) + "\n" - with open(json_path, "w", encoding="utf-8") as f: - f.write(json_string) + if self.use_galore and self.use_badam: + raise ValueError("Cannot use GaLore with BAdam together.") - @classmethod - def load_from_json(cls, json_path: str): - r"""Creates an instance from the content of `json_path`.""" - with open(json_path, "r", encoding="utf-8") as f: - text = f.read() + if self.loraplus_lr_ratio is not None and self.finetuning_type != "lora": + raise ValueError("`loraplus_lr_ratio` is only valid for LoRA training.") - return cls(**json.loads(text)) + if self.train_mm_proj_only and self.finetuning_type != "full": + raise ValueError("`train_mm_proj_only` is only valid for full training.") diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/generating_args.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/generating_args.py similarity index 88% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/generating_args.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/generating_args.py index 70dabb3..0ee17d1 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/generating_args.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/generating_args.py @@ -1,5 +1,5 @@ from dataclasses import asdict, dataclass, field -from typing import Any, Dict +from typing import Any, Dict, Optional @dataclass @@ -31,11 +31,11 @@ class GeneratingArguments: metadata={"help": "Number of beams for beam search. 1 means no beam search."}, ) max_length: int = field( - default=512, + default=1024, metadata={"help": "The maximum length the generated tokens can have. 
It can be overridden by max_new_tokens."}, ) max_new_tokens: int = field( - default=512, + default=1024, metadata={"help": "The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt."}, ) repetition_penalty: float = field( @@ -46,6 +46,10 @@ class GeneratingArguments: default=1.0, metadata={"help": "Exponential penalty to the length that is used with beam-based generation."}, ) + default_system: Optional[str] = field( + default=None, + metadata={"help": "Default system message to use in chat completion."}, + ) def to_dict(self) -> Dict[str, Any]: args = asdict(self) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/model_args.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/model_args.py similarity index 73% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/model_args.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/model_args.py index a371958..6352a42 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/model_args.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/model_args.py @@ -15,14 +15,19 @@ class ModelArguments: ) adapter_name_or_path: Optional[str] = field( default=None, - metadata={"help": "Path to the adapter weight or identifier from huggingface.co/models."}, + metadata={ + "help": ( + "Path to the adapter weight or identifier from huggingface.co/models. " + "Use commas to separate multiple adapters." + ) + }, ) cache_dir: Optional[str] = field( default=None, metadata={"help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn."}, ) use_fast_tokenizer: bool = field( - default=False, + default=True, metadata={"help": "Whether or not to use one of the fast tokenizer (backed by the tokenizers library)."}, ) resize_vocab: bool = field( @@ -33,6 +38,10 @@ class ModelArguments: default=False, metadata={"help": "Whether or not the special tokens should be split during the tokenization process."}, ) + new_special_tokens: Optional[str] = field( + default=None, + metadata={"help": "Special tokens to be added into the tokenizer. 
Use commas to separate multiple tokens."}, + ) model_revision: str = field( default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) @@ -53,22 +62,38 @@ class ModelArguments: default=True, metadata={"help": "Whether or not to use double quantization in int4 training."}, ) + quantization_device_map: Optional[Literal["auto"]] = field( + default=None, + metadata={"help": "Device map used to infer the 4-bit quantized model, needs bitsandbytes>=0.43.0."}, + ) rope_scaling: Optional[Literal["linear", "dynamic"]] = field( default=None, metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."}, ) - flash_attn: bool = field( - default=False, - metadata={"help": "Enable FlashAttention-2 for faster training."}, + flash_attn: Literal["off", "sdpa", "fa2", "auto"] = field( + default="auto", + metadata={"help": "Enable FlashAttention for faster training and inference."}, ) shift_attn: bool = field( default=False, metadata={"help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."}, ) + mixture_of_depths: Optional[Literal["convert", "load"]] = field( + default=None, + metadata={"help": "Convert the model to mixture-of-depths (MoD) or load the MoD model."}, + ) use_unsloth: bool = field( default=False, metadata={"help": "Whether or not to use unsloth's optimization for the LoRA training."}, ) + visual_inputs: bool = field( + default=False, + metadata={"help": "Whether or not to use multimodal LLM that accepts visual inputs."}, + ) + moe_aux_loss_coef: Optional[float] = field( + default=None, + metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."}, + ) disable_gradient_checkpointing: bool = field( default=False, metadata={"help": "Whether or not to disable gradient checkpointing."}, @@ -81,13 +106,17 @@ class ModelArguments: default=False, metadata={"help": "Whether or not to upcast the output of lm_head in fp32."}, ) + train_from_scratch: bool = field( + default=False, + metadata={"help": "Whether or not to randomly initialize the model weights."}, + ) infer_backend: Literal["huggingface", "vllm"] = field( default="huggingface", metadata={"help": "Backend engine used at inference."}, ) vllm_maxlen: int = field( default=2048, - metadata={"help": "Maximum input length of the vLLM engine."}, + metadata={"help": "Maximum sequence (prompt + response) length of the vLLM engine."}, ) vllm_gpu_util: float = field( default=0.9, @@ -97,6 +126,14 @@ class ModelArguments: default=False, metadata={"help": "Whether or not to disable CUDA graph in the vLLM engine."}, ) + vllm_max_lora_rank: int = field( + default=8, + metadata={"help": "Maximum rank of all LoRAs in the vLLM engine."}, + ) + vllm_dtype: Literal["auto", "float16", "bfloat16", "float32"] = field( + default="auto", + metadata={"help": "Data type for model weights and activations in the vLLM engine."}, + ) offload_folder: str = field( default="offload", metadata={"help": "Path to offload model weights."}, @@ -121,6 +158,10 @@ class ModelArguments: default=1, metadata={"help": "The file shard size (in GB) of the exported model."}, ) + export_device: Literal["cpu", "auto"] = field( + default="cpu", + metadata={"help": "The device used in model export, use `auto` to accelerate exporting."}, + ) export_quantization_bit: Optional[int] = field( default=None, metadata={"help": "The number of bits to quantize the exported model."}, @@ -158,9 +199,15 @@ class ModelArguments: if self.split_special_tokens and
self.use_fast_tokenizer: raise ValueError("`split_special_tokens` is only supported for slow tokenizers.") + if self.visual_inputs and self.use_unsloth: + raise ValueError("Unsloth does not support MLLM yet. Stay tuned.") + if self.adapter_name_or_path is not None: # support merging multiple lora weights self.adapter_name_or_path = [path.strip() for path in self.adapter_name_or_path.split(",")] + if self.new_special_tokens is not None: # support multiple special tokens + self.new_special_tokens = [token.strip() for token in self.new_special_tokens.split(",")] + assert self.quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." assert self.export_quantization_bit in [None, 8, 4, 3, 2], "We only accept 2/3/4/8-bit quantization." diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/parser.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/parser.py similarity index 72% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/parser.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/parser.py index 7c9954b..ff1fbf5 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/hparams/parser.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/hparams/parser.py @@ -6,13 +6,14 @@ from typing import Any, Dict, Optional, Tuple import torch import transformers from transformers import HfArgumentParser, Seq2SeqTrainingArguments +from transformers.integrations import is_deepspeed_zero3_enabled from transformers.trainer_utils import get_last_checkpoint from transformers.utils import is_torch_bf16_gpu_available from transformers.utils.versions import require_version +from ..extras.constants import CHECKPOINT_NAMES from ..extras.logging import get_logger -from ..extras.misc import check_dependencies -from ..extras.packages import is_unsloth_available +from ..extras.misc import check_dependencies, get_current_device from .data_args import DataArguments from .evaluation_args import EvaluationArguments from .finetuning_args import FinetuningArguments @@ -64,10 +65,16 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin if model_args.adapter_name_or_path is not None and finetuning_args.finetuning_type != "lora": raise ValueError("Adapter is only valid for the LoRA method.") + if model_args.use_unsloth and is_deepspeed_zero3_enabled(): + raise ValueError("Unsloth is incompatible with DeepSpeed ZeRO-3.") + if model_args.quantization_bit is not None: if finetuning_args.finetuning_type != "lora": raise ValueError("Quantization is only compatible with the LoRA method.") + if model_args.resize_vocab: + raise ValueError("Cannot resize embedding layers of a quantized model.") + if model_args.adapter_name_or_path is not None and finetuning_args.create_new_adapter: raise ValueError("Cannot create new adapter upon a quantized model.") @@ -75,6 +82,35 @@ def _verify_model_args(model_args: "ModelArguments", finetuning_args: "Finetunin raise ValueError("Quantized model only accepts a single adapter. 
Merge them first.") +def _check_extra_dependencies( + model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", + training_args: Optional["Seq2SeqTrainingArguments"] = None, +) -> None: + if model_args.use_unsloth: + require_version("unsloth", "Please install unsloth: https://github.com/unslothai/unsloth") + + if model_args.mixture_of_depths is not None: + require_version("mixture-of-depth>=1.1.6", "To fix: pip install mixture-of-depth>=1.1.6") + + if model_args.infer_backend == "vllm": + require_version("vllm>=0.4.3", "To fix: pip install vllm>=0.4.3") + + if finetuning_args.use_galore: + require_version("galore_torch", "To fix: pip install galore_torch") + + if finetuning_args.use_badam: + require_version("badam", "To fix: pip install badam") + + if finetuning_args.plot_loss: + require_version("matplotlib", "To fix: pip install matplotlib") + + if training_args is not None and training_args.predict_with_generate: + require_version("jieba", "To fix: pip install jieba") + require_version("nltk", "To fix: pip install nltk") + require_version("rouge_chinese", "To fix: pip install rouge-chinese") + + def _parse_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: parser = HfArgumentParser(_TRAIN_ARGS) return _parse_args(parser, args) @@ -119,21 +155,24 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: if finetuning_args.stage == "ppo" and finetuning_args.reward_model_type == "lora" and model_args.use_unsloth: raise ValueError("Unsloth does not support lora reward model.") + if ( + finetuning_args.stage == "ppo" + and training_args.report_to + and training_args.report_to[0] not in ["wandb", "tensorboard"] + ): + raise ValueError("PPO only accepts wandb or tensorboard logger.") + if training_args.max_steps == -1 and data_args.streaming: raise ValueError("Please specify `max_steps` in streaming mode.") if training_args.do_train and training_args.predict_with_generate: raise ValueError("`predict_with_generate` cannot be set as True while training.") - if training_args.do_train and model_args.use_unsloth and not is_unsloth_available(): - raise ValueError("Unsloth was not installed: https://github.com/unslothai/unsloth") + if training_args.do_train and model_args.quantization_device_map == "auto": + raise ValueError("Cannot use device map for quantized models in training.") - if finetuning_args.use_dora: - if model_args.quantization_bit is not None: - require_version("peft>=0.9.1.dev0", "To fix: pip install git+https://github.com/huggingface/peft.git") - - if model_args.use_unsloth: - raise ValueError("Unsloth does not support DoRA.") + if finetuning_args.use_dora and model_args.use_unsloth: + raise ValueError("Unsloth does not support DoRA.") if finetuning_args.pure_bf16: if not is_torch_bf16_gpu_available(): @@ -149,18 +188,33 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: ): raise ValueError("Distributed training does not support layer-wise GaLore.") + if ( + finetuning_args.use_badam + and finetuning_args.badam_mode == "layer" + and training_args.parallel_mode.value == "distributed" + ): + raise ValueError("Layer-wise BAdam does not yet support distributed training, use ratio-wise BAdam.") + + if (finetuning_args.use_galore or finetuning_args.use_badam) and training_args.deepspeed is not None: + raise ValueError("GaLore and BAdam are incompatible with DeepSpeed yet.") + if model_args.infer_backend == "vllm": raise ValueError("vLLM backend is only available for API, CLI and Web.") + if model_args.visual_inputs and 
data_args.packing: + raise ValueError("Cannot use packing in MLLM fine-tuning.") + _verify_model_args(model_args, finetuning_args) + _check_extra_dependencies(model_args, finetuning_args, training_args) if ( training_args.do_train and finetuning_args.finetuning_type == "lora" + and model_args.quantization_bit is None and model_args.resize_vocab and finetuning_args.additional_target is None ): - logger.warning("Add token embeddings to `additional_target` to make the added tokens trainable.") + logger.warning("Remember to add embedding layers to `additional_target` to make the added tokens trainable.") if training_args.do_train and model_args.quantization_bit is not None and (not model_args.upcast_layernorm): logger.warning("We recommend enable `upcast_layernorm` in quantized training.") @@ -202,16 +256,15 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: and can_resume_from_checkpoint ): last_checkpoint = get_last_checkpoint(training_args.output_dir) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + if last_checkpoint is None and any( + os.path.isfile(os.path.join(training_args.output_dir, name)) for name in CHECKPOINT_NAMES + ): raise ValueError("Output directory already exists and is not empty. Please set `overwrite_output_dir`.") if last_checkpoint is not None: training_args.resume_from_checkpoint = last_checkpoint - logger.info( - "Resuming training from {}. Change `output_dir` or use `overwrite_output_dir` to avoid.".format( - training_args.resume_from_checkpoint - ) - ) + logger.info("Resuming training from {}.".format(training_args.resume_from_checkpoint)) + logger.info("Change `output_dir` or use `overwrite_output_dir` to avoid.") if ( finetuning_args.stage in ["rm", "ppo"] @@ -230,10 +283,11 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: elif training_args.fp16: model_args.compute_dtype = torch.float16 + model_args.device_map = {"": get_current_device()} model_args.model_max_length = data_args.cutoff_len data_args.packing = data_args.packing if data_args.packing is not None else finetuning_args.stage == "pt" - # Log on each process the small summary: + # Log on each process the small summary logger.info( "Process rank: {}, device: {}, n_gpu: {}, distributed training: {}, compute dtype: {}".format( training_args.local_rank, @@ -261,18 +315,25 @@ def get_infer_args(args: Optional[Dict[str, Any]] = None) -> _INFER_CLS: if finetuning_args.stage != "sft": raise ValueError("vLLM engine only supports auto-regressive models.") - if model_args.adapter_name_or_path is not None: - raise ValueError("vLLM engine does not support LoRA adapters. Merge them first.") - if model_args.quantization_bit is not None: - raise ValueError("vLLM engine does not support quantization.") + raise ValueError("vLLM engine does not support bnb quantization (GPTQ and AWQ are supported).") if model_args.rope_scaling is not None: raise ValueError("vLLM engine does not support RoPE scaling.") - _verify_model_args(model_args, finetuning_args) + if model_args.adapter_name_or_path is not None and len(model_args.adapter_name_or_path) != 1: + raise ValueError("vLLM only accepts a single adapter. Merge them first.") - model_args.device_map = "auto" + if finetuning_args.stage == "rm" and model_args.visual_inputs: + raise ValueError("Reward server does not support MLLM yet. 
Stay tuned.") + + _verify_model_args(model_args, finetuning_args) + _check_extra_dependencies(model_args, finetuning_args) + + if model_args.export_dir is not None and model_args.export_device == "cpu": + model_args.device_map = {"": torch.device("cpu")} + else: + model_args.device_map = "auto" return model_args, data_args, finetuning_args, generating_args @@ -289,6 +350,7 @@ def get_eval_args(args: Optional[Dict[str, Any]] = None) -> _EVAL_CLS: raise ValueError("vLLM backend is only available for API, CLI and Web.") _verify_model_args(model_args, finetuning_args) + _check_extra_dependencies(model_args, finetuning_args) model_args.device_map = "auto" diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/launcher.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/launcher.py new file mode 100644 index 0000000..de154db --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/launcher.py @@ -0,0 +1,9 @@ +from llamafactory.train.tuner import run_exp + + +def launch(): + run_exp() + + +if __name__ == "__main__": + launch() diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/__init__.py new file mode 100644 index 0000000..9d23d59 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/__init__.py @@ -0,0 +1,12 @@ +from .loader import load_config, load_model, load_tokenizer +from .model_utils.misc import find_all_linear_modules +from .model_utils.valuehead import load_valuehead_params + + +__all__ = [ + "load_config", + "load_model", + "load_tokenizer", + "find_all_linear_modules", + "load_valuehead_params", +] diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/adapter.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/adapter.py new file mode 100644 index 0000000..f4e501a --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/adapter.py @@ -0,0 +1,275 @@ +import re +from typing import TYPE_CHECKING + +import torch +from peft import LoraConfig, LoraModel, PeftModel, TaskType, get_peft_model +from transformers.integrations import is_deepspeed_zero3_enabled +from transformers.modeling_utils import is_fsdp_enabled + +from ..extras.logging import get_logger +from .model_utils.misc import find_all_linear_modules, find_expanded_modules +from .model_utils.quantization import QuantizationMethod +from .model_utils.unsloth import get_unsloth_peft_model, load_unsloth_peft_model + + +if TYPE_CHECKING: + from transformers import PretrainedConfig, PreTrainedModel + + from ..hparams import FinetuningArguments, ModelArguments + + +logger = get_logger(__name__) + + +def _setup_full_tuning( + model: "PreTrainedModel", + model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", + cast_trainable_params_to_fp32: bool, +) -> None: + logger.info("Fine-tuning method: Full") + forbidden_modules = set() + if model_args.visual_inputs and finetuning_args.freeze_vision_tower: + forbidden_modules.add("vision_tower") + + if model_args.visual_inputs and finetuning_args.train_mm_proj_only: + forbidden_modules.add("language_model") + + for name, param in model.named_parameters(): + if not any(forbidden_module in name for forbidden_module in forbidden_modules): + if cast_trainable_params_to_fp32: + param.data = param.data.to(torch.float32) + else: + param.requires_grad_(False) + + +def _setup_freeze_tuning( + model: "PreTrainedModel", + model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", + cast_trainable_params_to_fp32: bool, 
+) -> None: + logger.info("Fine-tuning method: Freeze") + if model_args.visual_inputs: + config = model.config.text_config + else: + config = model.config + + num_layers = ( + getattr(config, "num_hidden_layers", None) + or getattr(config, "num_layers", None) + or getattr(config, "n_layer", None) + ) + if not num_layers: + raise ValueError("Current model does not support freeze tuning.") + + if finetuning_args.use_llama_pro: + if num_layers % finetuning_args.freeze_trainable_layers != 0: + raise ValueError( + "`num_layers` {} should be divisible by `num_layer_trainable` {}.".format( + num_layers, finetuning_args.freeze_trainable_layers + ) + ) + + stride = num_layers // finetuning_args.freeze_trainable_layers + trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride) + elif finetuning_args.freeze_trainable_layers > 0: # fine-tuning the last n layers if num_layer_trainable > 0 + trainable_layer_ids = range(max(0, num_layers - finetuning_args.freeze_trainable_layers), num_layers) + else: # fine-tuning the first n layers if num_layer_trainable < 0 + trainable_layer_ids = range(min(-finetuning_args.freeze_trainable_layers, num_layers)) + + hidden_modules = set() + non_hidden_modules = set() + for name, _ in model.named_parameters(): + if ".0." in name: + hidden_modules.add(name.split(".0.")[-1].split(".")[0]) + elif ".1." in name: # MoD starts from layer 1 + hidden_modules.add(name.split(".1.")[-1].split(".")[0]) + + if re.search(r"\.\d+\.", name) is None: + non_hidden_modules.add(name.split(".")[-2]) + + trainable_layers = [] + for module_name in finetuning_args.freeze_trainable_modules: + if module_name != "all" and module_name not in hidden_modules: + raise ValueError( + "Module {} is not found, please choose from {}".format(module_name, ", ".join(hidden_modules)) + ) + + for idx in trainable_layer_ids: + trainable_layers.append(".{:d}.{}".format(idx, module_name if module_name != "all" else "")) + + if finetuning_args.freeze_extra_modules: + for module_name in finetuning_args.freeze_extra_modules: + if module_name not in non_hidden_modules: + raise ValueError( + "Module {} is not found, please choose from {}".format(module_name, ", ".join(non_hidden_modules)) + ) + + trainable_layers.append(module_name) + + forbidden_modules = set() + if model_args.visual_inputs and finetuning_args.freeze_vision_tower: + forbidden_modules.add("vision_tower") + + for name, param in model.named_parameters(): + if any(trainable_layer in name for trainable_layer in trainable_layers) and not any( + forbidden_module in name for forbidden_module in forbidden_modules + ): + if cast_trainable_params_to_fp32: + param.data = param.data.to(torch.float32) + else: + param.requires_grad_(False) + + logger.info("Set trainable layers: {}".format(",".join(trainable_layers))) + + +def _setup_lora_tuning( + config: "PretrainedConfig", + model: "PreTrainedModel", + model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", + is_trainable: bool, + cast_trainable_params_to_fp32: bool, +) -> "PeftModel": + logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA")) + adapter_to_resume = None + + if model_args.adapter_name_or_path is not None: + is_mergeable = True + if getattr(model, "quantization_method", None): # merge lora in quantized model is unstable + assert len(model_args.adapter_name_or_path) == 1, "Quantized model only accepts a single adapter." 
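
For the use_llama_pro branch of _setup_freeze_tuning just above, a quick worked example of the stride-based selection (the numbers are only illustrative): with 32 layers and freeze_trainable_layers=8, one layer out of every block of four becomes trainable.

    num_layers, freeze_trainable_layers = 32, 8
    stride = num_layers // freeze_trainable_layers  # 4
    trainable_layer_ids = list(range(stride - 1, num_layers + stride - 1, stride))
    print(trainable_layer_ids)  # [3, 7, 11, 15, 19, 23, 27, 31]
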
+ is_mergeable = False + + if is_deepspeed_zero3_enabled(): + assert len(model_args.adapter_name_or_path) == 1, "Cannot use multiple adapters in DeepSpeed ZeRO-3." + is_mergeable = False + + if model_args.use_unsloth: + assert len(model_args.adapter_name_or_path) == 1, "Unsloth model only accepts a single adapter." + is_mergeable = False + + if (is_trainable and not finetuning_args.create_new_adapter) or (not is_mergeable): + adapter_to_merge = model_args.adapter_name_or_path[:-1] + adapter_to_resume = model_args.adapter_name_or_path[-1] + else: + adapter_to_merge = model_args.adapter_name_or_path + + for adapter in adapter_to_merge: + model: "LoraModel" = PeftModel.from_pretrained(model, adapter, offload_folder=model_args.offload_folder) + model = model.merge_and_unload() + + if len(adapter_to_merge) > 0: + logger.info("Merged {} adapter(s).".format(len(adapter_to_merge))) + + if adapter_to_resume is not None: # resume lora training + if model_args.use_unsloth: + model = load_unsloth_peft_model(config, model_args, is_trainable=is_trainable) + else: + model = PeftModel.from_pretrained( + model, + adapter_to_resume, + is_trainable=is_trainable, + offload_folder=model_args.offload_folder, + ) + + if is_trainable and adapter_to_resume is None: # create new lora weights while training + if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all": + target_modules = find_all_linear_modules(model, finetuning_args.freeze_vision_tower) + else: + target_modules = finetuning_args.lora_target + + if finetuning_args.use_llama_pro: + target_modules = find_expanded_modules(model, target_modules, finetuning_args.freeze_trainable_layers) + + if model_args.visual_inputs and finetuning_args.freeze_vision_tower: + target_modules = "^(?!.*vision_tower).*(?:{}).*".format("|".join(target_modules)) + + if ( + finetuning_args.use_dora + and getattr(model, "quantization_method", None) is not None + and getattr(model, "quantization_method", None) != QuantizationMethod.BITS_AND_BYTES + ): + raise ValueError("DoRA is not compatible with PTQ-quantized models.") + + if model_args.resize_vocab and finetuning_args.additional_target is None: + input_embeddings = model.get_input_embeddings() + output_embeddings = model.get_output_embeddings() + module_names = set() + for name, module in model.named_modules(): + if module in [input_embeddings, output_embeddings]: + module_names.add(name.split(".")[-1]) + + finetuning_args.additional_target = module_names + logger.warning("Vocab has been resized, add {} to trainable params.".format(",".join(module_names))) + + peft_kwargs = { + "r": finetuning_args.lora_rank, + "target_modules": target_modules, + "lora_alpha": finetuning_args.lora_alpha, + "lora_dropout": finetuning_args.lora_dropout, + "use_rslora": finetuning_args.use_rslora, + "modules_to_save": finetuning_args.additional_target, + } + + if model_args.use_unsloth: + model = get_unsloth_peft_model(model, model_args, peft_kwargs) + else: + lora_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, + inference_mode=False, + use_dora=finetuning_args.use_dora, + **peft_kwargs, + ) + model = get_peft_model(model, lora_config) + + if is_trainable and cast_trainable_params_to_fp32: + for param in filter(lambda p: p.requires_grad, model.parameters()): + param.data = param.data.to(torch.float32) + + if model_args.adapter_name_or_path is not None: + logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path))) + + return model + + +def init_adapter( + config: "PretrainedConfig", + 
model: "PreTrainedModel", + model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", + is_trainable: bool, +) -> "PreTrainedModel": + r""" + Initializes the adapters. + + Support full-parameter, freeze and LoRA training. + + Note that the trainable parameters must be cast to float32. + """ + if (not is_trainable) and model_args.adapter_name_or_path is None: + logger.info("Adapter is not found at evaluation, load the base model.") + return model + + if finetuning_args.finetuning_type != "lora" and getattr(model, "quantization_method", None): + raise ValueError("You can only use lora for quantized models.") + + if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or finetuning_args.pure_bf16 or finetuning_args.use_badam: + logger.info("ZeRO3/FSDP/PureBF16/BAdam detected, remaining trainable params as their original precision.") + cast_trainable_params_to_fp32 = False + else: + logger.info("Upcasting trainable params to float32.") + cast_trainable_params_to_fp32 = True + + if is_trainable and finetuning_args.finetuning_type == "full": + _setup_full_tuning(model, model_args, finetuning_args, cast_trainable_params_to_fp32) + + if is_trainable and finetuning_args.finetuning_type == "freeze": + _setup_freeze_tuning(model, model_args, finetuning_args, cast_trainable_params_to_fp32) + + if finetuning_args.finetuning_type == "lora": + model = _setup_lora_tuning( + config, model, model_args, finetuning_args, is_trainable, cast_trainable_params_to_fp32 + ) + + return model diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/loader.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/loader.py new file mode 100644 index 0000000..697a04e --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/loader.py @@ -0,0 +1,186 @@ +from typing import TYPE_CHECKING, Any, Dict, Optional, TypedDict + +from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForVision2Seq, AutoProcessor, AutoTokenizer +from trl import AutoModelForCausalLMWithValueHead + +from ..extras.logging import get_logger +from ..extras.misc import count_parameters, try_download_model_from_ms +from .adapter import init_adapter +from .model_utils.misc import register_autoclass +from .model_utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model +from .model_utils.unsloth import load_unsloth_pretrained_model +from .model_utils.valuehead import load_valuehead_params +from .patcher import patch_config, patch_model, patch_tokenizer, patch_valuehead_model + + +if TYPE_CHECKING: + from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer, ProcessorMixin + + from ..hparams import FinetuningArguments, ModelArguments + + +logger = get_logger(__name__) + + +class TokenizerModule(TypedDict): + tokenizer: "PreTrainedTokenizer" + processor: Optional["ProcessorMixin"] + + +def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]: + r""" + Gets arguments to load config/tokenizer/model. + + Note: including inplace operation of model_args. + """ + model_args.model_name_or_path = try_download_model_from_ms(model_args) + return { + "trust_remote_code": True, + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "token": model_args.hf_hub_token, + } + + +def load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule": + r""" + Loads pretrained tokenizer. + + Note: including inplace operation of model_args. 
+ """ + init_kwargs = _get_init_kwargs(model_args) + try: + tokenizer = AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + use_fast=model_args.use_fast_tokenizer, + split_special_tokens=model_args.split_special_tokens, + padding_side="right", + **init_kwargs, + ) + except ValueError: # try the fast one + tokenizer = AutoTokenizer.from_pretrained( + model_args.model_name_or_path, + use_fast=True, + padding_side="right", + **init_kwargs, + ) + + if model_args.new_special_tokens is not None: + num_added_tokens = tokenizer.add_special_tokens( + dict(additional_special_tokens=model_args.new_special_tokens), + replace_additional_special_tokens=False, + ) + logger.info("Add {} to special tokens.".format(",".join(model_args.new_special_tokens))) + if num_added_tokens > 0 and not model_args.resize_vocab: + model_args.resize_vocab = True + logger.warning("New tokens have been added, changed `resize_vocab` to True.") + + patch_tokenizer(tokenizer) + + if model_args.visual_inputs: + try: + processor = AutoProcessor.from_pretrained(model_args.model_name_or_path, **init_kwargs) + setattr(processor, "tokenizer", tokenizer) + except Exception: + raise ValueError( + "This multimodal LLM is not supported.\n" + "Download LLaVA-1.5 models from: https://huggingface.co/llava-hf\n" + "Download Yi-VL models from: https://huggingface.co/BUAADreamer" + ) + else: + processor = None + + return {"tokenizer": tokenizer, "processor": processor} + + +def load_config(model_args: "ModelArguments") -> "PretrainedConfig": + r""" + Loads model config. + """ + init_kwargs = _get_init_kwargs(model_args) + return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs) + + +def load_model( + tokenizer: "PreTrainedTokenizer", + model_args: "ModelArguments", + finetuning_args: "FinetuningArguments", + is_trainable: bool = False, + add_valuehead: bool = False, +) -> "PreTrainedModel": + r""" + Loads pretrained model. 
+ """ + init_kwargs = _get_init_kwargs(model_args) + config = load_config(model_args) + patch_config(config, tokenizer, model_args, init_kwargs, is_trainable) + + model = None + lazy_load = False + if model_args.use_unsloth: + if model_args.adapter_name_or_path is not None: + lazy_load = True + elif is_trainable: + model = load_unsloth_pretrained_model(config, model_args) + + if model is None and not lazy_load: + init_kwargs["config"] = config + init_kwargs["pretrained_model_name_or_path"] = model_args.model_name_or_path + + if model_args.mixture_of_depths == "load": + model = load_mod_pretrained_model(**init_kwargs) + elif model_args.visual_inputs: + model = AutoModelForVision2Seq.from_pretrained(**init_kwargs) + elif model_args.train_from_scratch: + model = AutoModelForCausalLM.from_config(config) + else: + model = AutoModelForCausalLM.from_pretrained(**init_kwargs) + + if model_args.mixture_of_depths == "convert": + model = convert_pretrained_model_to_mod(model, config, model_args) + + if not lazy_load: + patch_model(model, tokenizer, model_args, is_trainable, add_valuehead) + register_autoclass(config, model, tokenizer) + + model = init_adapter(config, model, model_args, finetuning_args, is_trainable) + + if add_valuehead: + model = AutoModelForCausalLMWithValueHead.from_pretrained(model) + patch_valuehead_model(model) + + if model_args.adapter_name_or_path is not None: + vhead_path = model_args.adapter_name_or_path[-1] + else: + vhead_path = model_args.model_name_or_path + + vhead_params = load_valuehead_params(vhead_path, model_args) + if vhead_params is not None: + model.load_state_dict(vhead_params, strict=False) + logger.info("Loaded valuehead from checkpoint: {}".format(vhead_path)) + + if not is_trainable: + model.requires_grad_(False) + model.eval() + else: + model.train() + + trainable_params, all_param = count_parameters(model) + if is_trainable: + param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( + trainable_params, all_param, 100 * trainable_params / all_param + ) + else: + param_stats = "all params: {:d}".format(all_param) + + logger.info(param_stats) + + if model_args.print_param_status: + for name, param in model.named_parameters(): + print( + "name: {}, dtype: {}, device: {}, trainable: {}".format( + name, param.dtype, param.device, param.requires_grad + ) + ) + + return model diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/attention.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/attention.py new file mode 100644 index 0000000..b52ddc8 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/attention.py @@ -0,0 +1,55 @@ +from typing import TYPE_CHECKING + +from ...extras.logging import get_logger +from ...extras.packages import is_flash_attn2_available, is_sdpa_available + + +if TYPE_CHECKING: + from transformers import PretrainedConfig + + from ...hparams import ModelArguments + + +logger = get_logger(__name__) + + +def configure_attn_implementation(config: "PretrainedConfig", model_args: "ModelArguments") -> None: + if model_args.flash_attn == "auto": + return + + elif model_args.flash_attn == "off": + requested_attn_implementation = "eager" + + elif model_args.flash_attn == "sdpa": + if not is_sdpa_available(): + 
logger.warning("torch>=2.1.1 is required for SDPA attention.") + return + + requested_attn_implementation = "sdpa" + elif model_args.flash_attn == "fa2": + if not is_flash_attn2_available(): + logger.warning("FlashAttention-2 is not installed.") + return + + requested_attn_implementation = "flash_attention_2" + else: + raise NotImplementedError("Unknown attention type: {}".format(model_args.flash_attn)) + + if getattr(config, "model_type", None) == "internlm2": # special case for custom models + setattr(config, "attn_implementation", requested_attn_implementation) + else: + setattr(config, "_attn_implementation", requested_attn_implementation) + + +def print_attn_implementation(config: "PretrainedConfig") -> None: + if getattr(config, "model_type", None) == "internlm2": # special case for custom models + attn_implementation = getattr(config, "attn_implementation", None) + else: + attn_implementation = getattr(config, "_attn_implementation", None) + + if attn_implementation == "flash_attention_2": + logger.info("Using FlashAttention-2 for faster training and inference.") + elif attn_implementation == "sdpa": + logger.info("Using torch SDPA for faster training and inference.") + else: + logger.info("Using vanilla attention implementation.") diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/checkpointing.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/checkpointing.py new file mode 100644 index 0000000..e0657be --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/checkpointing.py @@ -0,0 +1,94 @@ +import inspect +from functools import partial +from types import MethodType +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple + +import torch + +from ...extras.constants import LAYERNORM_NAMES +from ...extras.logging import get_logger + + +if TYPE_CHECKING: + from transformers import PreTrainedModel + + from ...hparams import ModelArguments + + +logger = get_logger(__name__) + + +def _gradient_checkpointing_enable( + self: "PreTrainedModel", gradient_checkpointing_kwargs: Optional[Dict[str, Any]] = None +) -> None: + r""" + Activates gradient checkpointing for the current model. + + Modification of the original method to enable gradient checkpointing for block-wise optimizer. + """ + from torch.utils.checkpoint import checkpoint + + if not self.supports_gradient_checkpointing: + raise ValueError("{} does not support gradient checkpointing.".format(self.__class__.__name__)) + + if gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {"use_reentrant": True} + + gradient_checkpointing_func = partial(checkpoint, **gradient_checkpointing_kwargs) + + def custom_gradient_checkpointing_func(func, *args, **kwargs): + module: "torch.nn.Module" = func.__self__ + + if any(param.requires_grad for param in module.parameters()): + for arg in args: + if torch.is_tensor(arg) and torch.is_floating_point(arg): + arg.requires_grad_(True) + + return gradient_checkpointing_func(func, *args, **kwargs) + + if "value" in inspect.signature(self._set_gradient_checkpointing).parameters: # old GC format + self.apply(partial(self._set_gradient_checkpointing, value=True)) + self.enable_input_require_grads() + logger.warning("You are using the old GC format, some features (e.g. 
BAdam) will be invalid.") + else: # have already enabled input require gradients + self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=custom_gradient_checkpointing_func) + + +def _fp32_forward_post_hook( + module: "torch.nn.Module", args: Tuple["torch.Tensor"], output: "torch.Tensor" +) -> "torch.Tensor": + return output.to(torch.float32) + + +def prepare_model_for_training( + model: "PreTrainedModel", model_args: "ModelArguments", output_layer_name: str = "lm_head" +) -> None: + r""" + Includes: + (1) cast the layernorm in fp32 + (2) make output embedding layer require grads + (3) add the upcasting of the lm_head in fp32 + Inspired by: https://github.com/huggingface/peft/blob/v0.7.1/src/peft/utils/other.py#L72 + """ + if model_args.upcast_layernorm: + logger.info("Upcasting layernorm weights in float32.") + for name, param in model.named_parameters(): + if param.ndim == 1 and any(ln_name in name for ln_name in LAYERNORM_NAMES): + param.data = param.data.to(torch.float32) + + if not model_args.disable_gradient_checkpointing: + if not getattr(model, "supports_gradient_checkpointing", False): + logger.warning("Current model does not support gradient checkpointing.") + else: + # use_reentrant=False might increase VRAM usage (have not been empirically verified yet) + # According to: https://github.com/huggingface/transformers/issues/28339 + model.gradient_checkpointing_enable = MethodType(_gradient_checkpointing_enable, model) + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": True}) + setattr(model.config, "use_cache", False) # turn off when gradient checkpointing is enabled + logger.info("Gradient checkpointing enabled.") + + if hasattr(model, output_layer_name) and model_args.upcast_lmhead_output: + logger.info("Upcasting lm_head outputs in float32.") + output_layer = getattr(model, output_layer_name) + if isinstance(output_layer, torch.nn.Linear) and output_layer.weight.dtype != torch.float32: + output_layer.register_forward_hook(_fp32_forward_post_hook) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/embedding.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/embedding.py new file mode 100644 index 0000000..3d9278e --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/embedding.py @@ -0,0 +1,58 @@ +import math +from contextlib import nullcontext +from typing import TYPE_CHECKING + +import torch +from transformers.integrations import is_deepspeed_zero3_enabled + +from ...extras.logging import get_logger + + +if TYPE_CHECKING: + from transformers import PreTrainedModel, PreTrainedTokenizer + + +logger = get_logger(__name__) + + +def _noisy_mean_initialization(embed_weight: "torch.Tensor", num_new_tokens: int) -> None: + embedding_dim = embed_weight.size(1) + avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True) + noise_weight = torch.empty_like(embed_weight[-num_new_tokens:]) + noise_weight.normal_(mean=0, std=(1.0 / math.sqrt(embedding_dim))) + embed_weight[-num_new_tokens:] = avg_weight + noise_weight + + +def resize_embedding_layer(model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer") -> None: + r""" + Resize token embeddings. 
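
The _noisy_mean_initialization helper above can be checked in isolation: newly added rows start from the mean of the existing embeddings plus small Gaussian noise scaled by 1/sqrt(dim). A toy demonstration with made-up sizes:

    import math
    import torch

    torch.manual_seed(0)
    embed_weight = torch.randn(10, 4)  # pretend 8 existing tokens + 2 new tokens, dim 4
    num_new_tokens = 2

    avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True)
    noise_weight = torch.empty_like(embed_weight[-num_new_tokens:])
    noise_weight.normal_(mean=0, std=1.0 / math.sqrt(embed_weight.size(1)))
    embed_weight[-num_new_tokens:] = avg_weight + noise_weight

    print(embed_weight[-num_new_tokens:])  # both new rows sit close to the mean of the old rows
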
+ """ + if is_deepspeed_zero3_enabled(): + import deepspeed # type: ignore + + params = [model.get_input_embeddings().weight] + if model.get_output_embeddings() is not None and not model.config.tie_word_embeddings: + params.append(model.get_output_embeddings().weight) + + context_maybe_zero3 = deepspeed.zero.GatheredParameters(params, modifier_rank=0) + else: + context_maybe_zero3 = nullcontext() + + with context_maybe_zero3: + current_embedding_size = model.get_input_embeddings().weight.size(0) + + if len(tokenizer) > current_embedding_size: + if getattr(model, "quantization_method", None): + raise ValueError("Cannot resize embedding layers of a quantized model.") + + if not isinstance(model.get_output_embeddings(), torch.nn.Linear): + raise ValueError("Current model does not support resizing embedding layers.") + + model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64) + with context_maybe_zero3: + new_embedding_size = model.get_input_embeddings().weight.size(0) + num_new_tokens = new_embedding_size - current_embedding_size + _noisy_mean_initialization(model.get_input_embeddings().weight.data, num_new_tokens) + _noisy_mean_initialization(model.get_output_embeddings().weight.data, num_new_tokens) + + logger.info("Resized token embeddings from {} to {}.".format(current_embedding_size, new_embedding_size)) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/longlora.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/longlora.py new file mode 100644 index 0000000..c8dc52f --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/longlora.py @@ -0,0 +1,323 @@ +import math +from typing import TYPE_CHECKING, Optional, Tuple + +import torch +import torch.nn as nn +from transformers.models.llama.modeling_llama import ( + Cache, + LlamaAttention, + LlamaFlashAttention2, + LlamaSdpaAttention, + apply_rotary_pos_emb, + repeat_kv, +) +from transformers.utils import logging +from transformers.utils.versions import require_version + +from ...extras.constants import SUPPORTED_CLASS_FOR_S2ATTN +from ...extras.logging import get_logger + + +if TYPE_CHECKING: + from transformers import PretrainedConfig + + from ...hparams import ModelArguments + + +logger = logging.get_logger(__name__) + + +# Modified from: +# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py +def llama_attention_forward( + self: "LlamaAttention", + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional["Cache"] = None, + output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states: "torch.Tensor" = self.q_proj(hidden_states) + key_states: "torch.Tensor" = self.k_proj(hidden_states) + value_states: "torch.Tensor" = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + past_key_value = getattr(self, 
"past_key_value", past_key_value) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if getattr(self.config, "group_size_ratio", None) and self.training: # shift + groupsz = int(q_len * getattr(self.config, "group_size_ratio")) + assert q_len % groupsz == 0, "q_len {} should be divisible by group size {}.".format(q_len, groupsz) + num_groups = q_len // groupsz + + def shift(state: torch.Tensor) -> torch.Tensor: + state = state.transpose(1, 2) # output: (bsz, seq_len, n_heads, head_dim) + state = torch.cat( + (state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)), + dim=2, + ) + return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states) + if attention_mask is not None: + attention_mask = attention_mask[:, :, :groupsz, :groupsz].repeat(num_groups, 1, 1, 1) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) # (bsz, :, seq_len, :) or (bsz * n_group, :, groupsz, :) + attn_output = attn_output.transpose(1, 2).contiguous() + + if getattr(self.config, "group_size_ratio", None) and self.training: # shift back + attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim) + attn_output = torch.cat( + ( + attn_output[:, :, : self.num_heads // 2], + attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1), + ) + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Modified from: +# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py +def llama_flash_attention_2_forward( + self: "LlamaFlashAttention2", + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional["Cache"] = None, + output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # LlamaFlashAttention2 attention does not support output_attentions + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states: "torch.Tensor" = self.q_proj(hidden_states) + key_states: "torch.Tensor" = self.k_proj(hidden_states) + value_states: "torch.Tensor" = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + 
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + past_key_value = getattr(self, "past_key_value", past_key_value) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # FlashAttention requires the input to have the shape (bsz, seq_len, n_heads, head_dim) + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once("The input hidden states seems to be silently casted in float32.") + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + if getattr(self.config, "group_size_ratio", None) and self.training: # shift + groupsz = int(q_len * getattr(self.config, "group_size_ratio")) + assert q_len % groupsz == 0, "q_len {} should be divisible by group size {}.".format(q_len, groupsz) + num_groups = q_len // groupsz + + def shift(state: torch.Tensor) -> torch.Tensor: + state = torch.cat( + (state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)), + dim=2, + ) + return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim) + + query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states) + if attention_mask is not None: + attention_mask = attention_mask[:, :groupsz].repeat(num_groups, 1) + else: + groupsz = q_len + + attn_output: torch.Tensor = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, groupsz, dropout=dropout_rate + ) + + if getattr(self.config, "group_size_ratio", None) and self.training: # shift back + attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim) + attn_output = torch.cat( + ( + attn_output[:, :, : self.num_heads // 2], + attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1), + ) + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Modified from: +# https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py +def llama_sdpa_attention_forward( + self: "LlamaSdpaAttention", + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional["Cache"] = None, + output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + 
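+        # the eager fallback below is the patched llama_attention_forward above, so shift short attention is still applied when enabled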
logger.warning_once("SDPA does not support `output_attentions=True`. Falling back to the vanilla attention") + return llama_attention_forward( + self, + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + cache_position=cache_position, + **kwargs, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states: "torch.Tensor" = self.q_proj(hidden_states) + key_states: "torch.Tensor" = self.k_proj(hidden_states) + value_states: "torch.Tensor" = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if getattr(self.config, "group_size_ratio", None) and self.training: # shift + groupsz = int(q_len * getattr(self.config, "group_size_ratio")) + assert q_len % groupsz == 0, "q_len {} should be divisible by group size {}.".format(q_len, groupsz) + num_groups = q_len // groupsz + + def shift(state: torch.Tensor) -> torch.Tensor: + state = state.transpose(1, 2) # output: (bsz, seq_len, n_heads, head_dim) + state = torch.cat( + (state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)), + dim=2, + ) + return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim).transpose(1, 2) + + query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states) + if attention_mask is not None: + attention_mask = attention_mask[:, :, :groupsz, :groupsz].repeat(num_groups, 1, 1, 1) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=causal_mask is None and q_len > 1, + ) + attn_output = attn_output.transpose(1, 2).contiguous() + + if getattr(self.config, "group_size_ratio", None) and self.training: # shift back + attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim) + attn_output = torch.cat( + ( + attn_output[:, :, : self.num_heads // 2], + attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1), + ) + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +def _apply_llama_patch() -> None: + require_version("transformers==4.40.2", "To fix: pip install transformers==4.40.2") + LlamaAttention.forward = llama_attention_forward + LlamaFlashAttention2.forward = 
llama_flash_attention_2_forward + LlamaSdpaAttention.forward = llama_sdpa_attention_forward + + +def configure_longlora(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None: + if not is_trainable or not model_args.shift_attn: + return + + logger = get_logger(__name__) + + if getattr(config, "model_type", None) in SUPPORTED_CLASS_FOR_S2ATTN: + setattr(config, "group_size_ratio", 0.25) + _apply_llama_patch() + logger.info("Using shift short attention with group_size_ratio=1/4.") + else: + logger.warning("Current model does not support shift short attention.") diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/misc.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/misc.py new file mode 100644 index 0000000..4851bd2 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/misc.py @@ -0,0 +1,74 @@ +from typing import TYPE_CHECKING, List + +from ...extras.logging import get_logger + + +if TYPE_CHECKING: + from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer + + +logger = get_logger(__name__) + + +def find_all_linear_modules(model: "PreTrainedModel", freeze_vision_tower: bool) -> List[str]: + r""" + Finds all available modules to apply lora or galore. + """ + forbidden_modules = {"lm_head"} + + if model.config.model_type == "chatglm": + forbidden_modules.add("output_layer") + elif model.config.model_type == "internlm2": + forbidden_modules.add("output") + elif model.config.model_type in ["llava", "paligemma"]: + forbidden_modules.add("multi_modal_projector") + + if freeze_vision_tower: + forbidden_modules.add("vision_tower") + + module_names = set() + for name, module in model.named_modules(): + if any(forbidden_module in name for forbidden_module in forbidden_modules): + continue + + if "Linear" in module.__class__.__name__ and "Embedding" not in module.__class__.__name__: + module_names.add(name.split(".")[-1]) + + logger.info("Found linear modules: {}".format(",".join(module_names))) + return list(module_names) + + +def find_expanded_modules(model: "PreTrainedModel", target_modules: List[str], num_layer_trainable: int) -> List[str]: + r""" + Finds the modules in the expanded blocks to apply lora. 
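+    For example (hypothetical sizes): a 32-layer model with num_layer_trainable=8 gives stride=4, so only layers 3, 7, 11, ..., 31 receive LoRA.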
+ """ + num_layers = getattr(model.config, "num_hidden_layers", None) + if not num_layers: + raise ValueError("Model was not supported.") + + if num_layers % num_layer_trainable != 0: + raise ValueError( + "`num_layers` {} should be divisible by `num_layer_trainable` {}.".format(num_layers, num_layer_trainable) + ) + + stride = num_layers // num_layer_trainable + trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride) + trainable_layers = [".{:d}.".format(idx) for idx in trainable_layer_ids] + module_names = [] + for name, _ in model.named_modules(): + if any(target_module in name for target_module in target_modules) and any( + trainable_layer in name for trainable_layer in trainable_layers + ): + module_names.append(name) + + logger.info("Apply lora to layers: {}".format(",".join(map(str, trainable_layer_ids)))) + return module_names + + +def register_autoclass(config: "PretrainedConfig", model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer"): + if "AutoConfig" in getattr(config, "auto_map", {}): + config.__class__.register_for_auto_class() + if "AutoModelForCausalLM" in getattr(config, "auto_map", {}): + model.__class__.register_for_auto_class() + if "AutoTokenizer" in tokenizer.init_kwargs.get("auto_map", {}): + tokenizer.__class__.register_for_auto_class() diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/mod.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/mod.py new file mode 100644 index 0000000..5708a1a --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/mod.py @@ -0,0 +1,28 @@ +from typing import TYPE_CHECKING + +from ...extras.constants import MOD_SUPPORTED_MODELS + + +if TYPE_CHECKING: + from transformers import PretrainedConfig, PreTrainedModel + + from ...hparams import ModelArguments + + +def load_mod_pretrained_model(**init_kwargs) -> "PreTrainedModel": + from MoD import AutoMoDModelForCausalLM + + return AutoMoDModelForCausalLM.from_pretrained(**init_kwargs) + + +def convert_pretrained_model_to_mod( + model: "PreTrainedModel", config: "PretrainedConfig", model_args: "ModelArguments" +) -> "PreTrainedModel": + from MoD import apply_mod_to_hf + + if getattr(config, "model_type", None) not in MOD_SUPPORTED_MODELS: + raise ValueError("Current model is not supported by mixture-of-depth.") + + model = apply_mod_to_hf(model) + model = model.to(model_args.compute_dtype) + return model diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/moe.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/moe.py new file mode 100644 index 0000000..e554e45 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/moe.py @@ -0,0 +1,61 @@ +from typing import TYPE_CHECKING + +from transformers.integrations import is_deepspeed_zero3_enabled +from transformers.utils.versions import require_version + + +if TYPE_CHECKING: + from transformers import PretrainedConfig, PreTrainedModel + + from ...hparams import ModelArguments + + +def add_z3_leaf_module(model: "PreTrainedModel") -> None: + r""" + Sets module as a leaf module to skip partitioning in deepspeed zero3. 
+ """ + if not is_deepspeed_zero3_enabled(): + return + + require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0") + from deepspeed.utils import set_z3_leaf_modules # type: ignore + + if getattr(model.config, "model_type", None) == "dbrx": + from transformers.models.dbrx.modeling_dbrx import DbrxFFN + + set_z3_leaf_modules(model, [DbrxFFN]) + + if getattr(model.config, "model_type", None) == "jamba": + from transformers.models.jamba.modeling_jamba import JambaSparseMoeBlock + + set_z3_leaf_modules(model, [JambaSparseMoeBlock]) + + if getattr(model.config, "model_type", None) == "jetmoe": + from transformers.models.jetmoe.modeling_jetmoe import JetMoeMoA, JetMoeMoE + + set_z3_leaf_modules(model, [JetMoeMoA, JetMoeMoE]) + + if getattr(model.config, "model_type", None) == "mixtral": + from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock + + set_z3_leaf_modules(model, [MixtralSparseMoeBlock]) + + if getattr(model.config, "model_type", None) == "qwen2moe": + from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock + + set_z3_leaf_modules(model, [Qwen2MoeSparseMoeBlock]) + + +def configure_moe(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None: + if model_args.moe_aux_loss_coef is not None: + if getattr(config, "model_type", None) in ["jamba", "mixtral", "qwen2_moe"]: + setattr(config, "router_aux_loss_coef", model_args.moe_aux_loss_coef) + + elif getattr(config, "model_type", None) == "deepseek": + setattr(config, "aux_loss_alpha", model_args.moe_aux_loss_coef) + + elif getattr(config, "model_type", None) == "jetmoe": + setattr(config, "aux_loss_coef", model_args.moe_aux_loss_coef) + + if getattr(config, "model_type", None) in ["dbrx", "jamba", "jetmoe", "mixtral", "qwen2_moe"]: + setattr(config, "output_router_logits", is_trainable) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/quantization.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/quantization.py new file mode 100644 index 0000000..02a54f0 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/quantization.py @@ -0,0 +1,150 @@ +import os +import random +from enum import Enum, unique +from typing import TYPE_CHECKING, Any, Dict, List + +import torch +from datasets import load_dataset +from transformers import BitsAndBytesConfig, GPTQConfig +from transformers.integrations import is_deepspeed_zero3_enabled +from transformers.modeling_utils import is_fsdp_enabled +from transformers.utils.versions import require_version + +from ...extras.constants import FILEEXT2TYPE +from ...extras.logging import get_logger +from ...extras.misc import get_current_device + + +if TYPE_CHECKING: + from transformers import PretrainedConfig, PreTrainedTokenizer + + from ...hparams import ModelArguments + + +logger = get_logger(__name__) + + +@unique +class QuantizationMethod(str, Enum): + r""" + Borrowed from `transformers.utils.quantization_config.QuantizationMethod`. 
+ """ + + BITS_AND_BYTES = "bitsandbytes" + GPTQ = "gptq" + AWQ = "awq" + AQLM = "aqlm" + QUANTO = "quanto" + EETQ = "eetq" + HQQ = "hqq" + + +def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[str]: + r""" + Inspired by: https://github.com/huggingface/optimum/blob/v1.16.0/optimum/gptq/data.py#L133 + TODO: remove tokenizer.decode() https://github.com/huggingface/optimum/pull/1600 + """ + if os.path.isfile(model_args.export_quantization_dataset): + data_path = FILEEXT2TYPE.get(model_args.export_quantization_dataset.split(".")[-1], None) + data_files = model_args.export_quantization_dataset + else: + data_path = model_args.export_quantization_dataset + data_files = None + + dataset = load_dataset(path=data_path, data_files=data_files, split="train", cache_dir=model_args.cache_dir) + maxlen = model_args.export_quantization_maxlen + + samples = [] + for _ in range(model_args.export_quantization_nsamples): + while True: + sample_idx = random.randint(0, len(dataset) - 1) + sample: Dict[str, torch.Tensor] = tokenizer(dataset[sample_idx]["text"], return_tensors="pt") + if sample["input_ids"].size(1) >= maxlen: + break # TODO: fix large maxlen + + word_idx = random.randint(0, sample["input_ids"].size(1) - maxlen - 1) + input_ids = sample["input_ids"][:, word_idx : word_idx + maxlen] + samples.append(tokenizer.decode(input_ids[0].tolist(), skip_special_tokens=True)) + + return samples + + +def configure_quantization( + config: "PretrainedConfig", + tokenizer: "PreTrainedTokenizer", + model_args: "ModelArguments", + init_kwargs: Dict[str, Any], +) -> None: + r""" + Priority: PTQ-quantized (training) > AutoGPTQ (export) > Bitsandbytes (training) + """ + if getattr(config, "quantization_config", None): # ptq + if is_deepspeed_zero3_enabled(): + raise ValueError("DeepSpeed ZeRO-3 is incompatible with quantized models.") + + if model_args.quantization_device_map != "auto": + init_kwargs["device_map"] = {"": get_current_device()} + + quantization_config: Dict[str, Any] = getattr(config, "quantization_config", None) + quant_method = quantization_config.get("quant_method", "") + + if quant_method == QuantizationMethod.GPTQ: + require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0") + quantization_config.pop("disable_exllama", None) # remove deprecated args + quantization_config["use_exllama"] = False # disable exllama + + if quant_method == QuantizationMethod.AWQ: + require_version("autoawq", "To fix: pip install autoawq") + + if quant_method == QuantizationMethod.AQLM: + require_version("transformers>=4.39.0", "To fix: pip install transformers>=4.39.0") + require_version("aqlm>=1.1.0", "To fix: pip install aqlm[gpu]>=1.1.0") + quantization_config["bits"] = 2 + + quant_bits = quantization_config.get("bits", "?") + logger.info("Loading {}-bit {}-quantized model.".format(quant_bits, quant_method.upper())) + + elif model_args.export_quantization_bit is not None: # auto-gptq + require_version("optimum>=1.16.0", "To fix: pip install optimum>=1.16.0") + require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0") + from accelerate.utils import get_max_memory + + if getattr(config, "model_type", None) == "chatglm": + raise ValueError("ChatGLM model is not supported.") + + init_kwargs["quantization_config"] = GPTQConfig( + bits=model_args.export_quantization_bit, + tokenizer=tokenizer, + dataset=_get_quantization_dataset(tokenizer, model_args), + ) + init_kwargs["device_map"] = "auto" + init_kwargs["max_memory"] = get_max_memory() 
+ logger.info("Quantizing model to {} bit.".format(model_args.export_quantization_bit)) + + elif model_args.quantization_bit is not None: # bnb + if model_args.quantization_bit == 8: + require_version("bitsandbytes>=0.37.0", "To fix: pip install bitsandbytes>=0.37.0") + init_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) + + elif model_args.quantization_bit == 4: + require_version("bitsandbytes>=0.39.0", "To fix: pip install bitsandbytes>=0.39.0") + init_kwargs["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=model_args.compute_dtype, + bnb_4bit_use_double_quant=model_args.double_quantization, + bnb_4bit_quant_type=model_args.quantization_type, + bnb_4bit_quant_storage=model_args.compute_dtype, # crucial for fsdp+qlora + ) + + if is_deepspeed_zero3_enabled() or is_fsdp_enabled() or model_args.quantization_device_map == "auto": + if model_args.quantization_bit != 4: + raise ValueError("Only 4-bit quantized model can use auto device map.") + + require_version("transformers>=4.39.0", "To fix: pip install transformers>=4.39.0") + require_version("accelerate>=0.28.0", "To fix: pip install accelerate>=0.28.0") + require_version("bitsandbytes>=0.43.0", "To fix: pip install bitsandbytes>=0.43.0") + init_kwargs["torch_dtype"] = model_args.compute_dtype # fsdp+qlora requires same dtype + else: + init_kwargs["device_map"] = {"": get_current_device()} + + logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit)) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/rope.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/rope.py new file mode 100644 index 0000000..93ab892 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/rope.py @@ -0,0 +1,47 @@ +import math +from typing import TYPE_CHECKING + +from ...extras.logging import get_logger + + +if TYPE_CHECKING: + from transformers import PretrainedConfig + + from ...hparams import ModelArguments + + +logger = get_logger(__name__) + + +def configure_rope(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None: + if model_args.rope_scaling is None: + return + + if not hasattr(config, "rope_scaling"): + logger.warning("Current model does not support RoPE scaling.") + return + + if is_trainable: + if model_args.rope_scaling == "dynamic": + logger.warning( + "Dynamic NTK scaling may not work well with fine-tuning. " + "See: https://github.com/huggingface/transformers/pull/24653" + ) + + current_max_length = getattr(config, "max_position_embeddings", None) + if current_max_length and model_args.model_max_length > current_max_length: + logger.info( + "Enlarge max model length from {} to {}.".format(current_max_length, model_args.model_max_length) + ) + setattr(config, "max_position_embeddings", model_args.model_max_length) + scaling_factor = float(math.ceil(model_args.model_max_length / current_max_length)) + else: + logger.warning("Input length is smaller than max length. 
Consider increase input length.") + scaling_factor = 1.0 + else: + scaling_factor = 2.0 + + setattr(config, "rope_scaling", {"type": model_args.rope_scaling, "factor": scaling_factor}) + logger.info( + "Using {} scaling strategy and setting scaling factor to {}".format(model_args.rope_scaling, scaling_factor) + ) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/unsloth.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/unsloth.py new file mode 100644 index 0000000..8a16409 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/unsloth.py @@ -0,0 +1,88 @@ +from typing import TYPE_CHECKING, Any, Dict, Optional + +from ...extras.logging import get_logger +from ...extras.misc import get_current_device + + +if TYPE_CHECKING: + from transformers import PretrainedConfig, PreTrainedModel + + from ...hparams import ModelArguments + + +logger = get_logger(__name__) + + +def _get_unsloth_kwargs( + config: "PretrainedConfig", model_name_or_path: str, model_args: "ModelArguments" +) -> Dict[str, Any]: + return { + "model_name": model_name_or_path, + "max_seq_length": model_args.model_max_length or 4096, + "dtype": model_args.compute_dtype, + "load_in_4bit": model_args.quantization_bit == 4, + "token": model_args.hf_hub_token, + "device_map": {"": get_current_device()}, + "rope_scaling": getattr(config, "rope_scaling", None), + "fix_tokenizer": False, + "trust_remote_code": True, + "use_gradient_checkpointing": "unsloth", + } + + +def load_unsloth_pretrained_model( + config: "PretrainedConfig", model_args: "ModelArguments" +) -> Optional["PreTrainedModel"]: + r""" + Optionally loads pretrained model with unsloth. Used in training. + """ + from unsloth import FastLanguageModel + + unsloth_kwargs = _get_unsloth_kwargs(config, model_args.model_name_or_path, model_args) + try: + model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs) + except NotImplementedError: + logger.warning("Unsloth does not support model type {}.".format(getattr(config, "model_type", None))) + model = None + model_args.use_unsloth = False + + return model + + +def get_unsloth_peft_model( + model: "PreTrainedModel", model_args: "ModelArguments", peft_kwargs: Dict[str, Any] +) -> "PreTrainedModel": + r""" + Gets the peft model for the pretrained model with unsloth. Used in training. + """ + from unsloth import FastLanguageModel + + unsloth_peft_kwargs = { + "model": model, + "max_seq_length": model_args.model_max_length, + "use_gradient_checkpointing": "unsloth", + } + return FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs) + + +def load_unsloth_peft_model( + config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool +) -> "PreTrainedModel": + r""" + Loads peft model with unsloth. Used in both training and inference. 
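+    Only the first entry of adapter_name_or_path is passed to unsloth, and gradient checkpointing is disabled when loading for inference.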
+ """ + from unsloth import FastLanguageModel + + unsloth_kwargs = _get_unsloth_kwargs(config, model_args.adapter_name_or_path[0], model_args) + try: + if not is_trainable: + unsloth_kwargs["use_gradient_checkpointing"] = False + + model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs) + except NotImplementedError: + raise ValueError("Unsloth does not support model type {}.".format(getattr(config, "model_type", None))) + + if not is_trainable: + FastLanguageModel.for_inference(model) + + return model diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/valuehead.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/valuehead.py new file mode 100644 index 0000000..6433368 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/valuehead.py @@ -0,0 +1,59 @@ +from typing import TYPE_CHECKING, Dict + +import torch +from transformers.utils import cached_file + +from ...extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME +from ...extras.logging import get_logger + + +if TYPE_CHECKING: + from transformers import PreTrainedModel + + from ...hparams import ModelArguments + + +logger = get_logger(__name__) + + +def load_valuehead_params(path_or_repo_id: str, model_args: "ModelArguments") -> Dict[str, torch.Tensor]: + r""" + Loads value head parameters from Hugging Face Hub or local disk. + + Returns: dict with keys `v_head.summary.weight` and `v_head.summary.bias`. + """ + kwargs = {"path_or_repo_id": path_or_repo_id, "cache_dir": model_args.cache_dir, "token": model_args.hf_hub_token} + err_text = "" + + try: + from safetensors import safe_open + + vhead_file = cached_file(filename=V_HEAD_SAFE_WEIGHTS_NAME, **kwargs) + with safe_open(vhead_file, framework="pt", device="cpu") as f: + return {key: f.get_tensor(key) for key in f.keys()} + except Exception as err: + err_text = str(err) + + try: + vhead_file = cached_file(filename=V_HEAD_WEIGHTS_NAME, **kwargs) + return torch.load(vhead_file, map_location="cpu") + except Exception as err: + err_text = str(err) + + logger.info("Provided path ({}) does not contain value head weights: {}.".format(path_or_repo_id, err_text)) + logger.info("Ignore the above message if you are not resuming the training of a value head model.") + return None + + +def prepare_valuehead_model(model: "PreTrainedModel") -> None: + if getattr(model.config, "model_type", None) == "llava": + setattr(model, "lm_head", model.language_model.get_output_embeddings()) + setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"]) + + if getattr(model.config, "model_type", None) == "chatglm": + setattr(model, "lm_head", model.transformer.output_layer) + setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"]) + + if getattr(model.config, "model_type", None) == "internlm2": + setattr(model, "lm_head", model.output) + setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"]) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/visual.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/visual.py new file mode 100644 index 0000000..c8260b7 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/model_utils/visual.py @@ -0,0 +1,84 @@ +from typing import TYPE_CHECKING, Tuple + +import torch +import transformers.models +from transformers.activations import ACT2FN + +from ...extras.logging import get_logger + + +if TYPE_CHECKING: + from transformers import LlavaConfig, PretrainedConfig, PreTrainedModel + + from ...hparams 
import ModelArguments + + +logger = get_logger(__name__) + + +class LlavaMultiModalProjectorForYiVL(torch.nn.Module): + def __init__(self, config: "LlavaConfig") -> None: + super().__init__() + + self.config = config + if config is None: + return + + self.linear_1 = torch.nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True) + self.linear_2 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True) + self.linear_3 = torch.nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True) + self.linear_4 = torch.nn.LayerNorm(config.text_config.hidden_size, bias=True) + self.act = ACT2FN[config.projector_hidden_act] + + def forward(self, image_features: "torch.Tensor") -> "torch.Tensor": + hidden_states = self.linear_1(image_features) + hidden_states = self.linear_2(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_3(hidden_states) + hidden_states = self.linear_4(hidden_states) + if hidden_states.dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.linear_1.weight.dtype + + logger.warning_once("The hidden states seems to be silently casted in float32.") + hidden_states = hidden_states.to(target_dtype) + + return hidden_states + + +class LlavaMultiModalProjectorForYiVLForVLLM(LlavaMultiModalProjectorForYiVL): + def __init__(self, vision_hidden_size: int, text_hidden_size: int, projector_hidden_act: str) -> None: + super().__init__(config=None) + + self.linear_1 = torch.nn.Linear(vision_hidden_size, text_hidden_size, bias=True) + self.linear_2 = torch.nn.LayerNorm(text_hidden_size, bias=True) + self.linear_3 = torch.nn.Linear(text_hidden_size, text_hidden_size, bias=True) + self.linear_4 = torch.nn.LayerNorm(text_hidden_size, bias=True) + self.act = ACT2FN[projector_hidden_act] + + +def autocast_projector_dtype( + model: "PreTrainedModel", model_args: "ModelArguments", mm_projector_name: str = "multi_modal_projector" +) -> None: + def _mm_projector_forward_post_hook( + module: "torch.nn.Module", args: Tuple["torch.Tensor"], output: "torch.Tensor" + ) -> "torch.Tensor": + return output.to(model_args.compute_dtype) + + if hasattr(model, mm_projector_name) and getattr(model, "quantization_method", None): + logger.info("Casting multimodal projector outputs in {}.".format(model_args.compute_dtype)) + mm_projector: "torch.nn.Module" = getattr(model, mm_projector_name) + mm_projector.register_forward_hook(_mm_projector_forward_post_hook) + + +def configure_visual_model(config: "PretrainedConfig") -> None: + if getattr(config, "model_type", None) == "llava": # required for ds zero3 and valuehead models + setattr(config, "hidden_size", getattr(config.text_config, "hidden_size", None)) + + if getattr(config, "is_yi_vl_derived_model", None): + logger.info("Detected Yi-VL model, applying projector patch.") + transformers.models.llava.modeling_llava.LlavaMultiModalProjector = LlavaMultiModalProjectorForYiVL diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/patcher.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/patcher.py new file mode 100644 index 0000000..87c9231 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/model/patcher.py @@ -0,0 +1,143 @@ +import os +from types import MethodType +from typing import TYPE_CHECKING, Any, Dict + +import torch +from peft import PeftModel 
+from transformers import PreTrainedModel, PreTrainedTokenizerBase, is_torch_npu_available +from transformers.integrations import is_deepspeed_zero3_enabled +from transformers.modeling_utils import is_fsdp_enabled + +from ..extras.logging import get_logger +from ..extras.misc import infer_optim_dtype +from .model_utils.attention import configure_attn_implementation, print_attn_implementation +from .model_utils.checkpointing import prepare_model_for_training +from .model_utils.embedding import resize_embedding_layer +from .model_utils.longlora import configure_longlora +from .model_utils.moe import add_z3_leaf_module, configure_moe +from .model_utils.quantization import configure_quantization +from .model_utils.rope import configure_rope +from .model_utils.valuehead import prepare_valuehead_model +from .model_utils.visual import autocast_projector_dtype, configure_visual_model + + +if TYPE_CHECKING: + from transformers import PretrainedConfig, PreTrainedTokenizer + from trl import AutoModelForCausalLMWithValueHead + + from ..hparams import ModelArguments + + +logger = get_logger(__name__) + + +def patch_tokenizer(tokenizer: "PreTrainedTokenizer") -> None: + if "PreTrainedTokenizerBase" not in str(tokenizer._pad.__func__): + tokenizer._pad = MethodType(PreTrainedTokenizerBase._pad, tokenizer) + + +def patch_config( + config: "PretrainedConfig", + tokenizer: "PreTrainedTokenizer", + model_args: "ModelArguments", + init_kwargs: Dict[str, Any], + is_trainable: bool, +) -> None: + if model_args.compute_dtype is None: # priority: bf16 > fp16 > fp32 + model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None)) + + if is_torch_npu_available(): + use_jit_compile = os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"] + torch.npu.set_compile_mode(jit_compile=use_jit_compile) + + configure_attn_implementation(config, model_args) + configure_rope(config, model_args, is_trainable) + configure_longlora(config, model_args, is_trainable) + configure_quantization(config, tokenizer, model_args, init_kwargs) + configure_moe(config, model_args, is_trainable) + configure_visual_model(config) + + if model_args.use_cache and not is_trainable: + setattr(config, "use_cache", True) + logger.info("Using KV cache for faster generation.") + + if getattr(config, "model_type", None) == "qwen": + setattr(config, "use_flash_attn", model_args.flash_attn == "fa2") + for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]: + setattr(config, dtype_name, model_args.compute_dtype == dtype) + + if getattr(config, "model_type", None) == "qwen2" and is_trainable and model_args.flash_attn == "fa2": + setattr(config, "use_cache", False) # qwen2 does not support use_cache when using flash attn + + # deepspeed zero3 is not compatible with low_cpu_mem_usage + init_kwargs["low_cpu_mem_usage"] = model_args.low_cpu_mem_usage and (not is_deepspeed_zero3_enabled()) + + if not is_deepspeed_zero3_enabled() and not is_fsdp_enabled(): # cast dtype and device if not use zero3 or fsdp + init_kwargs["torch_dtype"] = model_args.compute_dtype + + if init_kwargs["low_cpu_mem_usage"]: # device map requires low_cpu_mem_usage=True + if "device_map" not in init_kwargs and model_args.device_map: + init_kwargs["device_map"] = model_args.device_map + + if init_kwargs["device_map"] == "auto": + init_kwargs["offload_folder"] = model_args.offload_folder + + +def patch_model( + model: "PreTrainedModel", + tokenizer: "PreTrainedTokenizer", + model_args: "ModelArguments", + 
is_trainable: bool, + add_valuehead: bool, +) -> None: + gen_config = model.generation_config # check and fix generation config + if not gen_config.do_sample and ( + (gen_config.temperature is not None and gen_config.temperature != 1.0) + or (gen_config.top_p is not None and gen_config.top_p != 1.0) + or (gen_config.typical_p is not None and gen_config.typical_p != 1.0) + ): + gen_config.do_sample = True + + if "GenerationMixin" not in str(model.generate.__func__): + model.generate = MethodType(PreTrainedModel.generate, model) + + if add_valuehead: + prepare_valuehead_model(model) + + if model_args.resize_vocab: + resize_embedding_layer(model, tokenizer) + + if model_args.visual_inputs: + autocast_projector_dtype(model, model_args) + + if is_trainable: + prepare_model_for_training(model, model_args) + add_z3_leaf_module(model) + + if not model_args.use_unsloth: + print_attn_implementation(model.config) + + try: + model.add_model_tags(["llama-factory"]) + except Exception: + logger.warning("Cannot properly tag the model.") + + +def patch_valuehead_model(model: "AutoModelForCausalLMWithValueHead") -> None: + def tie_weights(self: "AutoModelForCausalLMWithValueHead") -> None: + if isinstance(self.pretrained_model, PreTrainedModel): + self.pretrained_model.tie_weights() + + def get_input_embeddings(self: "AutoModelForCausalLMWithValueHead") -> torch.nn.Module: + if isinstance(self.pretrained_model, PreTrainedModel): + return self.pretrained_model.get_input_embeddings() + + def create_or_update_model_card(self: "AutoModelForCausalLMWithValueHead", output_dir: str) -> None: + if isinstance(self.pretrained_model, PeftModel): + self.pretrained_model.create_or_update_model_card(output_dir) + + ignore_modules = [name for name, _ in model.named_parameters() if "pretrained_model" in name] + setattr(model, "_keys_to_ignore_on_save", ignore_modules) + setattr(model, "tie_weights", MethodType(tie_weights, model)) + setattr(model, "get_input_embeddings", MethodType(get_input_embeddings, model)) + setattr(model, "create_or_update_model_card", MethodType(create_or_update_model_card, model)) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/dpo/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/dpo/__init__.py similarity index 100% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/dpo/__init__.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/dpo/__init__.py diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/dpo/trainer.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/dpo/trainer.py new file mode 100644 index 0000000..d860b29 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/dpo/trainer.py @@ -0,0 +1,233 @@ +from collections import defaultdict +from contextlib import nullcontext +from types import MethodType +from typing import TYPE_CHECKING, Dict, Literal, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from transformers import Trainer +from trl import DPOTrainer +from trl.trainer import disable_dropout_in_model + +from ...extras.constants import IGNORE_INDEX +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps, get_ref_context + + +if TYPE_CHECKING: + from transformers import PreTrainedModel, ProcessorMixin + + from ...hparams import 
FinetuningArguments + + +class CustomDPOTrainer(DPOTrainer): + def __init__( + self, + model: Union["PreTrainedModel", torch.nn.Module], + ref_model: Optional[Union["PreTrainedModel", torch.nn.Module]], + finetuning_args: "FinetuningArguments", + processor: Optional["ProcessorMixin"], + disable_dropout: bool = True, + **kwargs, + ): + if disable_dropout: + disable_dropout_in_model(model) + if ref_model is not None: + disable_dropout_in_model(ref_model) + + self.finetuning_args = finetuning_args + self.processor = processor + self.reference_free = False + self.use_dpo_data_collator = True # hack to avoid warning + self.generate_during_eval = False # disable at evaluation + self.label_pad_token_id = IGNORE_INDEX + self.padding_value = 0 + self.is_encoder_decoder = model.config.is_encoder_decoder + self.precompute_ref_log_probs = False + self._precomputed_train_ref_log_probs = False + self._precomputed_eval_ref_log_probs = False + self._peft_has_been_casted_to_bf16 = False + + self.ref_model = ref_model + self._stored_metrics = defaultdict(lambda: defaultdict(list)) + + # dpo hyperparams + self.beta = finetuning_args.pref_beta + self.loss_type = finetuning_args.pref_loss + self.ftx_gamma = finetuning_args.pref_ftx + self.label_smoothing = finetuning_args.dpo_label_smoothing + self.simpo_gamma = finetuning_args.simpo_gamma + + Trainer.__init__(self, model=model, **kwargs) + if not hasattr(self, "accelerator"): + raise AttributeError("Please update `transformers`.") + + if ref_model is not None: + if self.is_deepspeed_enabled: + if not ( + getattr(ref_model, "is_loaded_in_8bit", False) or getattr(ref_model, "is_loaded_in_4bit", False) + ): # quantized models are already set on the correct device + self.ref_model = self._prepare_deepspeed(self.ref_model) + else: + self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + self.ref_model.eval() + + if finetuning_args.use_badam: + from badam import clip_grad_norm_for_sparse_tensor + + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + + def create_optimizer(self) -> "torch.optim.Optimizer": + if self.optimizer is None: + self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) + return super().create_optimizer() + + def create_scheduler( + self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None + ) -> "torch.optim.lr_scheduler.LRScheduler": + create_custom_scheduler(self.args, num_training_steps, optimizer) + return super().create_scheduler(num_training_steps, optimizer) + + def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: + super()._save(output_dir, state_dict) + if self.processor is not None: + output_dir = output_dir if output_dir is not None else self.args.output_dir + getattr(self.processor, "image_processor").save_pretrained(output_dir) + + def odds_ratio_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor": + r""" + Computes ORPO's odds ratio (OR) loss for batched log probabilities of the policy model. 
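+        Concretely: orpo_loss = -chosen_logps + beta * (-logsigmoid(log_odds)), where log_odds = log(odds(chosen)) - log(odds(rejected)) and odds(x) = p(x) / (1 - p(x)), computed from the length-normalized sequence log probabilities.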
+ """ + log_odds = (chosen_logps - rejected_logps) - ( + torch.log1p(-torch.exp(chosen_logps)) - torch.log1p(-torch.exp(rejected_logps)) + ) + sft_loss = -chosen_logps + odds_ratio_loss = -F.logsigmoid(log_odds) + orpo_loss = sft_loss + self.beta * odds_ratio_loss + return orpo_loss + + def simpo_loss(self, chosen_logps: "torch.Tensor", rejected_logps: "torch.Tensor") -> "torch.Tensor": + r""" + Computes SimPO loss for batched log probabilities of the policy model. + """ + pi_logratios = chosen_logps - rejected_logps + gamma_logratios = self.simpo_gamma / self.beta + logits = pi_logratios - gamma_logratios + simpo_loss = -F.logsigmoid(self.beta * logits) + return simpo_loss + + def compute_preference_loss( + self, + policy_chosen_logps: "torch.Tensor", + policy_rejected_logps: "torch.Tensor", + reference_chosen_logps: Optional["torch.Tensor"], + reference_rejected_logps: Optional["torch.Tensor"], + ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"]: + r""" + Computes loss for preference learning. + """ + if not self.finetuning_args.use_ref_model: + if self.loss_type == "orpo": + losses = self.odds_ratio_loss(policy_chosen_logps, policy_rejected_logps) + elif self.loss_type == "simpo": + losses = self.simpo_loss(policy_chosen_logps, policy_rejected_logps) + else: + raise NotImplementedError("Unknown loss type: {}.".format(self.loss_type)) + + chosen_rewards = self.beta * policy_chosen_logps.to(self.accelerator.device).detach() + rejected_rewards = self.beta * policy_rejected_logps.to(self.accelerator.device).detach() + else: + losses, chosen_rewards, rejected_rewards = self.dpo_loss( + policy_chosen_logps, policy_rejected_logps, reference_chosen_logps, reference_rejected_logps + ) + + return losses, chosen_rewards, rejected_rewards + + def concatenated_forward( + self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"] + ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]: + r""" + Computes the sum log probabilities of the labels under given logits if loss_type is not IPO, ORPO or SimPO. + + Otherwise the average log probabilities. + """ + if self.finetuning_args.use_ref_model: + batch = {k: v.detach().clone() for k, v in batch.items()} # avoid error + + all_logits: "torch.Tensor" = model(**batch, return_dict=True, use_cache=False).logits.to(torch.float32) + + all_logps, valid_length = get_batch_logps(logits=all_logits, labels=batch["labels"]) + if self.loss_type in ["ipo", "orpo", "simpo"]: + all_logps = all_logps / valid_length + + batch_size = batch["input_ids"].size(0) // 2 + chosen_logps, rejected_logps = all_logps.split(batch_size, dim=0) + chosen_logits, rejected_logits = all_logits.split(batch_size, dim=0) + chosen_length, _ = valid_length.split(batch_size, dim=0) + return chosen_logps, rejected_logps, chosen_logits, rejected_logits, chosen_logps / chosen_length + + def compute_reference_log_probs( + self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"] + ) -> Tuple[Optional["torch.Tensor"], Optional["torch.Tensor"]]: + r""" + Computes log probabilities of the reference model. 
+ """ + if not self.finetuning_args.use_ref_model: + return None, None + + if self.ref_model is None: + ref_model = model + ref_context = get_ref_context(self.accelerator, model) + else: + ref_model = self.ref_model + ref_context = nullcontext() + + with torch.no_grad(), ref_context: + reference_chosen_logps, reference_rejected_logps, *_ = self.concatenated_forward(ref_model, batch) + + return reference_chosen_logps, reference_rejected_logps + + def get_batch_loss_metrics( + self, + model: "PreTrainedModel", + batch: Dict[str, "torch.Tensor"], + train_eval: Literal["train", "eval"] = "train", + ) -> Tuple["torch.Tensor", Dict[str, "torch.Tensor"]]: + r""" + Computes the DPO loss and other metrics for the given batch of inputs for train or test. + """ + metrics = {} + ( + policy_chosen_logps, + policy_rejected_logps, + policy_chosen_logits, + policy_rejected_logits, + policy_chosen_logps_avg, + ) = self.concatenated_forward(model, batch) + + reference_chosen_logps, reference_rejected_logps = self.compute_reference_log_probs(model, batch) + losses, chosen_rewards, rejected_rewards = self.compute_preference_loss( + policy_chosen_logps, + policy_rejected_logps, + reference_chosen_logps, + reference_rejected_logps, + ) + sft_loss = -policy_chosen_logps_avg + if self.ftx_gamma > 1e-6: + losses += self.ftx_gamma * sft_loss + + reward_accuracies = (chosen_rewards > rejected_rewards).float() + + prefix = "eval_" if train_eval == "eval" else "" + metrics["{}rewards/chosen".format(prefix)] = chosen_rewards.mean().cpu() + metrics["{}rewards/rejected".format(prefix)] = rejected_rewards.mean().cpu() + metrics["{}rewards/accuracies".format(prefix)] = reward_accuracies.mean().cpu() + metrics["{}rewards/margins".format(prefix)] = (chosen_rewards - rejected_rewards).mean().cpu() + metrics["{}logps/rejected".format(prefix)] = policy_rejected_logps.detach().mean().cpu() + metrics["{}logps/chosen".format(prefix)] = policy_chosen_logps.detach().mean().cpu() + metrics["{}logits/rejected".format(prefix)] = policy_rejected_logits.detach().mean().cpu() + metrics["{}logits/chosen".format(prefix)] = policy_chosen_logits.detach().mean().cpu() + if self.loss_type == "orpo": + metrics["{}sft_loss".format(prefix)] = sft_loss.detach().mean().cpu() + metrics["{}odds_ratio_loss".format(prefix)] = ((losses - sft_loss) / self.beta).detach().mean().cpu() + + return losses.mean(), metrics diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/dpo/workflow.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/dpo/workflow.py new file mode 100644 index 0000000..992985b --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/dpo/workflow.py @@ -0,0 +1,83 @@ +# Inspired by: https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py + +from typing import TYPE_CHECKING, List, Optional + +from ...data import PairwiseDataCollatorWithPadding, get_dataset, split_dataset +from ...extras.constants import IGNORE_INDEX +from ...extras.ploting import plot_loss +from ...hparams import ModelArguments +from ...model import load_model, load_tokenizer +from ..trainer_utils import create_modelcard_and_push, create_ref_model +from .trainer import CustomDPOTrainer + + +if TYPE_CHECKING: + from transformers import Seq2SeqTrainingArguments, TrainerCallback + + from ...hparams import DataArguments, FinetuningArguments + + +def run_dpo( + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: 
"FinetuningArguments", + callbacks: Optional[List["TrainerCallback"]] = None, +): + tokenizer_module = load_tokenizer(model_args) + tokenizer = tokenizer_module["tokenizer"] + dataset = get_dataset(model_args, data_args, training_args, stage="rm", **tokenizer_module) + model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) + + data_collator = PairwiseDataCollatorWithPadding( + tokenizer=tokenizer, + pad_to_multiple_of=8, + label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id, + ) + + # Create reference model + if finetuning_args.use_ref_model: + if finetuning_args.ref_model is None and (not training_args.do_train): # use the model itself + ref_model = model + else: + ref_model = create_ref_model(model_args, finetuning_args) + else: + ref_model = None + + # Update arguments + training_args.remove_unused_columns = False # important for pairwise dataset + + # Initialize our Trainer + trainer = CustomDPOTrainer( + model=model, + ref_model=ref_model, + args=training_args, + finetuning_args=finetuning_args, + data_collator=data_collator, + callbacks=callbacks, + **tokenizer_module, + **split_dataset(dataset, data_args, training_args), + ) + + # Training + if training_args.do_train: + train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + trainer.save_model() + trainer.log_metrics("train", train_result.metrics) + trainer.save_metrics("train", train_result.metrics) + trainer.save_state() + if trainer.is_world_process_zero() and finetuning_args.plot_loss: + plot_loss(training_args.output_dir, keys=["loss", "eval_loss", "rewards/accuracies"]) + + # Evaluation + if training_args.do_eval: + metrics = trainer.evaluate(metric_key_prefix="eval") + if id(model) == id(ref_model): # unable to compute rewards if reference model is the model itself + remove_keys = [key for key in metrics.keys() if "rewards" in key] + for key in remove_keys: + metrics.pop(key) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + # Create model card + create_modelcard_and_push(trainer, model_args, data_args, training_args, finetuning_args) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/kto/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/kto/__init__.py new file mode 100644 index 0000000..34c7905 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/kto/__init__.py @@ -0,0 +1,4 @@ +from .workflow import run_kto + + +__all__ = ["run_kto"] diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/kto/trainer.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/kto/trainer.py new file mode 100644 index 0000000..22a84e4 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/kto/trainer.py @@ -0,0 +1,205 @@ +from collections import defaultdict +from contextlib import nullcontext +from types import MethodType +from typing import TYPE_CHECKING, Dict, Literal, Optional, Tuple, Union + +import torch +from transformers import Trainer +from trl import KTOTrainer +from trl.trainer import disable_dropout_in_model + +from ...extras.constants import IGNORE_INDEX +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler, get_batch_logps, get_ref_context + + +if TYPE_CHECKING: + import torch.utils.data + from transformers import PreTrainedModel, ProcessorMixin + + from ...hparams import FinetuningArguments + + +class CustomKTOTrainer(KTOTrainer): + def __init__( + self, + model: 
Union["PreTrainedModel", torch.nn.Module], + ref_model: Optional[Union["PreTrainedModel", torch.nn.Module]], + finetuning_args: "FinetuningArguments", + processor: Optional["ProcessorMixin"], + disable_dropout: bool = True, + **kwargs, + ): + if disable_dropout: + disable_dropout_in_model(model) + if ref_model is not None: + disable_dropout_in_model(ref_model) + + self.finetuning_args = finetuning_args + self.processor = processor + self.reference_free = False + self.use_dpo_data_collator = True # hack to avoid warning + self.generate_during_eval = False # disable at evaluation + self.label_pad_token_id = IGNORE_INDEX + self.padding_value = 0 + self.is_encoder_decoder = model.config.is_encoder_decoder + self.precompute_ref_log_probs = False + self._precomputed_train_ref_log_probs = False + self._precomputed_eval_ref_log_probs = False + self._peft_has_been_casted_to_bf16 = False + + self.ref_model = ref_model + self._stored_metrics = defaultdict(lambda: defaultdict(list)) + + # kto hyperparams + self.beta = finetuning_args.pref_beta + self.desirable_weight = finetuning_args.kto_chosen_weight + self.undesirable_weight = finetuning_args.kto_rejected_weight + self.ftx_gamma = finetuning_args.pref_ftx + + Trainer.__init__(self, model=model, **kwargs) + if not hasattr(self, "accelerator"): + raise AttributeError("Please update `transformers`.") + + if ref_model is not None: + if self.is_deepspeed_enabled: + if not ( + getattr(ref_model, "is_loaded_in_8bit", False) or getattr(ref_model, "is_loaded_in_4bit", False) + ): # quantized models are already set on the correct device + self.ref_model = self._prepare_deepspeed(self.ref_model) + else: + self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + self.ref_model.eval() + + if finetuning_args.use_badam: + from badam import clip_grad_norm_for_sparse_tensor + + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + + def create_optimizer(self) -> "torch.optim.Optimizer": + if self.optimizer is None: + self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) + return super().create_optimizer() + + def create_scheduler( + self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None + ) -> "torch.optim.lr_scheduler.LRScheduler": + create_custom_scheduler(self.args, num_training_steps, optimizer) + return super().create_scheduler(num_training_steps, optimizer) + + def _get_train_sampler(self) -> Optional["torch.utils.data.Sampler"]: + r""" + Replaces the sequential sampler of KTO Trainer created by trl with the random sampler. + """ + return Trainer._get_train_sampler(self) + + def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: + super()._save(output_dir, state_dict) + if self.processor is not None: + output_dir = output_dir if output_dir is not None else self.args.output_dir + getattr(self.processor, "image_processor").save_pretrained(output_dir) + + def forward( + self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"], prefix: Literal["", "kl_"] = "" + ) -> Tuple["torch.Tensor", "torch.Tensor"]: + r""" + Runs forward pass and computes the log probabilities. 
+ """ + batch = {k: v.detach().clone() for k, v in batch.items()} # avoid error + model_inputs = { + "input_ids": batch["{}input_ids".format(prefix)], + "attention_mask": batch["{}attention_mask".format(prefix)], + } + if "pixel_values" in batch: + model_inputs["pixel_values"] = batch["pixel_values"] + + if "{}token_type_ids".format(prefix) in batch: + model_inputs["token_type_ids"] = batch["{}token_type_ids".format(prefix)] + + logits = model(**model_inputs, return_dict=True, use_cache=False).logits.to(torch.float32) + + logps, valid_length = get_batch_logps(logits=logits, labels=batch["{}labels".format(prefix)]) + return logps, logps / valid_length + + def concatenated_forward( + self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"] + ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", "torch.Tensor"]: + target_logps, target_logps_avg = self.forward(model, batch) + with torch.no_grad(): + kl_logps, _ = self.forward(model, batch, prefix="kl_") + + if len(target_logps) != len(batch["kto_tags"]): + raise ValueError("Mismatched shape of inputs and labels.") + + chosen_logps = target_logps[batch["kto_tags"]] + rejected_logps = target_logps[~batch["kto_tags"]] + chosen_logps_avg = target_logps_avg[batch["kto_tags"]] + return chosen_logps, rejected_logps, kl_logps, chosen_logps_avg + + def compute_reference_log_probs( + self, model: "PreTrainedModel", batch: Dict[str, "torch.Tensor"] + ) -> Tuple["torch.Tensor", "torch.Tensor", "torch.Tensor"]: + r""" + Computes log probabilities of the reference model. + """ + if self.ref_model is None: + ref_model = model + ref_context = get_ref_context(self.accelerator, model) + else: + ref_model = self.ref_model + ref_context = nullcontext() + + with torch.no_grad(), ref_context: + reference_chosen_logps, reference_rejected_logps, reference_kl_logps, _ = self.concatenated_forward( + ref_model, batch + ) + + return reference_chosen_logps, reference_rejected_logps, reference_kl_logps + + def get_batch_loss_metrics( + self, + model: "PreTrainedModel", + batch: Dict[str, "torch.Tensor"], + ) -> Tuple["torch.Tensor", Dict[str, "torch.Tensor"]]: + r""" + Computes the DPO loss and other metrics for the given batch of inputs for train or test. 
+ """ + metrics = {} + policy_chosen_logps, policy_rejected_logps, policy_kl_logps, policy_chosen_logps_avg = ( + self.concatenated_forward(model, batch) + ) + reference_chosen_logps, reference_rejected_logps, reference_kl_logps = self.compute_reference_log_probs( + model, batch + ) + losses, chosen_rewards, rejected_rewards, kl = self.kto_loss( + policy_chosen_logps, + policy_rejected_logps, + policy_kl_logps, + reference_chosen_logps, + reference_rejected_logps, + reference_kl_logps, + ) + losses = losses.nanmean() + + if self.ftx_gamma > 1e-6 and len(policy_chosen_logps) > 0: # remember to rescale + sft_loss = -policy_chosen_logps_avg + losses += self.ftx_gamma * sft_loss.nanmean() / len(policy_chosen_logps) * len(batch["labels"]) + + num_chosen = torch.Tensor([len(chosen_rewards)]).to(self.accelerator.device) + num_rejected = torch.Tensor([len(rejected_rewards)]).to(self.accelerator.device) + + all_num_chosen = self.accelerator.gather(num_chosen).sum().item() + all_num_rejected = self.accelerator.gather(num_rejected).sum().item() + + if all_num_chosen > 0: + metrics["rewards/chosen_sum"] = self.accelerator.gather(chosen_rewards.nansum()).nansum().item() + metrics["logps/chosen_sum"] = self.accelerator.gather(policy_chosen_logps.nansum()).nansum().item() + metrics["count/chosen"] = all_num_chosen + + if all_num_rejected > 0: + metrics["rewards/rejected_sum"] = self.accelerator.gather(rejected_rewards.nansum()).nansum().item() + metrics["logps/rejected_sum"] = self.accelerator.gather(policy_rejected_logps.nansum()).nansum().item() + metrics["count/rejected"] = all_num_rejected + + metrics["kl"] = kl.item() + + return losses, metrics diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/dpo/workflow.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/kto/workflow.py similarity index 74% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/dpo/workflow.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/kto/workflow.py index 39ea1a0..c79b160 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/dpo/workflow.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/kto/workflow.py @@ -1,15 +1,12 @@ -# Inspired by: https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py - from typing import TYPE_CHECKING, List, Optional -from ...data import get_dataset, split_dataset +from ...data import KTODataCollatorWithPadding, get_dataset, split_dataset from ...extras.constants import IGNORE_INDEX from ...extras.ploting import plot_loss from ...hparams import ModelArguments from ...model import load_model, load_tokenizer -from ..utils import create_custom_optimzer, create_modelcard_and_push, create_ref_model -from .collator import DPODataCollatorWithPadding -from .trainer import CustomDPOTrainer +from ..trainer_utils import create_modelcard_and_push, create_ref_model +from .trainer import CustomKTOTrainer if TYPE_CHECKING: @@ -18,17 +15,19 @@ if TYPE_CHECKING: from ...hparams import DataArguments, FinetuningArguments -def run_dpo( +def run_kto( model_args: "ModelArguments", data_args: "DataArguments", training_args: "Seq2SeqTrainingArguments", finetuning_args: "FinetuningArguments", callbacks: Optional[List["TrainerCallback"]] = None, ): - tokenizer = load_tokenizer(model_args) - dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="rm") + tokenizer_module = load_tokenizer(model_args) + tokenizer = tokenizer_module["tokenizer"] + dataset = get_dataset(model_args, 
data_args, training_args, stage="kto", **tokenizer_module) model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) - data_collator = DPODataCollatorWithPadding( + + data_collator = KTODataCollatorWithPadding( tokenizer=tokenizer, pad_to_multiple_of=8, label_pad_token_id=IGNORE_INDEX if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id, @@ -44,18 +43,14 @@ def run_dpo( training_args.remove_unused_columns = False # important for pairwise dataset # Initialize our Trainer - optimizer = create_custom_optimzer(model, dataset, training_args, finetuning_args) - trainer = CustomDPOTrainer( - beta=finetuning_args.dpo_beta, - loss_type=finetuning_args.dpo_loss, - ftx_gamma=finetuning_args.dpo_ftx, + trainer = CustomKTOTrainer( model=model, ref_model=ref_model, args=training_args, - tokenizer=tokenizer, + finetuning_args=finetuning_args, data_collator=data_collator, callbacks=callbacks, - optimizers=(optimizer, None), + **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) @@ -67,7 +62,7 @@ def run_dpo( trainer.save_metrics("train", train_result.metrics) trainer.save_state() if trainer.is_world_process_zero() and finetuning_args.plot_loss: - plot_loss(training_args.output_dir, keys=["loss", "eval_loss"]) + plot_loss(training_args.output_dir, keys=["loss", "eval_loss", "train/rewards/chosen"]) # Evaluation if training_args.do_eval: diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/ppo/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/ppo/__init__.py similarity index 100% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/ppo/__init__.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/ppo/__init__.py diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/ppo/utils.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/ppo/ppo_utils.py similarity index 63% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/ppo/utils.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/ppo/ppo_utils.py index e6bdb89..fec3fc1 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/ppo/utils.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/ppo/ppo_utils.py @@ -8,15 +8,19 @@ from transformers.integrations import is_deepspeed_zero3_enabled from ...extras.packages import is_requests_available -if TYPE_CHECKING: - from transformers import PreTrainedModel - from trl import AutoModelForCausalLMWithValueHead - if is_requests_available(): import requests +if TYPE_CHECKING: + from transformers import PreTrainedModel + from trl import AutoModelForCausalLMWithValueHead + + def get_rewards_from_server(server_url: str, messages: List[str]) -> List[torch.Tensor]: + r""" + Gets reward scores from the API server. + """ headers = {"Content-Type": "application/json"} payload = {"model": "model", "messages": messages} response = requests.post(server_url, json=payload, headers=headers) @@ -25,25 +29,33 @@ def get_rewards_from_server(server_url: str, messages: List[str]) -> List[torch. def replace_model(model: "AutoModelForCausalLMWithValueHead", target: Literal["default", "reward"]) -> None: + r""" + Replaces the default/reward modules in the model. The model is already unwrapped. 
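The swap implemented just below relies on head weights that were stashed as non-persistent buffers. A stripped-down, self-contained version of that stash-and-swap pattern, using a toy value head in place of model.v_head:

import torch

class TinyValueHead(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.summary = torch.nn.Linear(4, 1)

head = TinyValueHead()
# stash the alternative ("reward") head as buffers, as create_reward_model does in trainer_utils
head.register_buffer("reward_head_weight", torch.randn(1, 4), persistent=False)
head.register_buffer("reward_head_bias", torch.randn(1), persistent=False)
# stash the current ("default") head before it gets overwritten
head.register_buffer("default_head_weight", head.summary.weight.data.detach().clone(), persistent=False)
head.register_buffer("default_head_bias", head.summary.bias.data.detach().clone(), persistent=False)

def swap(head, target):
    # copy the requested head into the live linear layer, keeping the device placement
    device = head.summary.weight.device
    head.summary.weight.data = head.get_buffer("{}_head_weight".format(target)).detach().clone().to(device)
    head.summary.bias.data = head.get_buffer("{}_head_bias".format(target)).detach().clone().to(device)

swap(head, "reward")   # score a batch with the reward head
swap(head, "default")  # restore the policy's own value head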
+ """ + v_head_layer = model.v_head.summary if is_deepspeed_zero3_enabled(): import deepspeed # type: ignore - params = [model.v_head.summary.weight, model.v_head.summary.bias] + params = [v_head_layer.weight, v_head_layer.bias] context_maybe_zero3 = deepspeed.zero.GatheredParameters(params, modifier_rank=0) else: context_maybe_zero3 = nullcontext() + model.pretrained_model.set_adapter(target) # set the LoRA adapter to be active with context_maybe_zero3: if target == "reward": # save default head temporarily - setattr(model, "default_head_weight", model.v_head.summary.weight.data.detach().clone()) - setattr(model, "default_head_bias", model.v_head.summary.bias.data.detach().clone()) + setattr(model, "default_head_weight", v_head_layer.weight.data.detach().clone()) + setattr(model, "default_head_bias", v_head_layer.bias.data.detach().clone()) - model.pretrained_model.set_adapter(target) # set the LoRA adapter to be active - model.v_head.summary.weight.data = model.get_buffer("{}_head_weight".format(target)).detach().clone() - model.v_head.summary.bias.data = model.get_buffer("{}_head_bias".format(target)).detach().clone() + device = v_head_layer.weight.device + v_head_layer.weight.data = model.get_buffer("{}_head_weight".format(target)).detach().clone().to(device) + v_head_layer.bias.data = model.get_buffer("{}_head_bias".format(target)).detach().clone().to(device) def dump_layernorm(model: "PreTrainedModel") -> Dict[str, torch.Tensor]: + r""" + Dumps the layernorm parameters in the model. The model is already unwrapped (and gathered). + """ layer_norm_params = {} for name, param in model.named_parameters(): if param.data.dtype == torch.float32: @@ -54,6 +66,9 @@ def dump_layernorm(model: "PreTrainedModel") -> Dict[str, torch.Tensor]: def restore_layernorm(model: "PreTrainedModel", layernorm_params: Optional[Dict[str, torch.Tensor]] = None) -> None: + r""" + Restores the layernorm parameters in the model. The model is already unwrapped (and gathered). 
+ """ for name, param in model.named_parameters(): if name in layernorm_params: param.data = layernorm_params[name] diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/ppo/trainer.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/ppo/trainer.py similarity index 63% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/ppo/trainer.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/ppo/trainer.py index a06d7ef..2e1288e 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/ppo/trainer.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/ppo/trainer.py @@ -1,25 +1,37 @@ import math import os import sys -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple +from types import MethodType +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple import torch +from accelerate.utils import DistributedDataParallelKwargs from tqdm import tqdm from transformers import GenerationConfig, Trainer, TrainerControl, TrainerState +from transformers.optimization import get_scheduler from transformers.trainer_pt_utils import remove_dummy_checkpoint from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME -from trl import PPOTrainer +from trl import PPOConfig, PPOTrainer from trl.core import PPODecorators, logprobs_from_logits +from trl.models.utils import unwrap_model_for_generation from ...extras.callbacks import FixValueHeadModelCallback, LogCallback from ...extras.logging import get_logger from ...extras.misc import AverageMeter, count_parameters, get_current_device, get_logits_processor -from .utils import dump_layernorm, get_rewards_from_server, replace_model, restore_layernorm +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler +from .ppo_utils import dump_layernorm, get_rewards_from_server, replace_model, restore_layernorm if TYPE_CHECKING: - from transformers import Seq2SeqTrainingArguments, TrainerCallback + from datasets import Dataset + from transformers import ( + DataCollatorWithPadding, + PreTrainedTokenizer, + ProcessorMixin, + Seq2SeqTrainingArguments, + TrainerCallback, + ) from trl import AutoModelForCausalLMWithValueHead from ...hparams import FinetuningArguments, GeneratingArguments, ModelArguments @@ -40,16 +52,68 @@ class CustomPPOTrainer(PPOTrainer, Trainer): finetuning_args: "FinetuningArguments", generating_args: "GeneratingArguments", callbacks: List["TrainerCallback"], - reward_model: "AutoModelForCausalLMWithValueHead", - **kwargs, + model: "AutoModelForCausalLMWithValueHead", + reward_model: Optional["AutoModelForCausalLMWithValueHead"], + ref_model: Optional["AutoModelForCausalLMWithValueHead"], + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], + dataset: "Dataset", + data_collator: "DataCollatorWithPadding", ): - PPOTrainer.__init__(self, **kwargs) + backward_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps + ppo_config = PPOConfig( + model_name=model_args.model_name_or_path, + learning_rate=training_args.learning_rate, + mini_batch_size=training_args.per_device_train_batch_size, + batch_size=backward_batch_size * finetuning_args.ppo_buffer_size, + gradient_accumulation_steps=training_args.gradient_accumulation_steps, + ppo_epochs=finetuning_args.ppo_epochs, + max_grad_norm=training_args.max_grad_norm, + seed=training_args.seed, + optimize_device_cache=True, + target=finetuning_args.ppo_target, + 
use_score_scaling=finetuning_args.ppo_score_norm, + use_score_norm=finetuning_args.ppo_score_norm, + whiten_rewards=finetuning_args.ppo_whiten_rewards, + accelerator_kwargs={"step_scheduler_with_optimizer": False}, + log_with=training_args.report_to[0] if training_args.report_to else None, + project_kwargs={"logging_dir": training_args.logging_dir}, + ) + + # Add deepspeed config + ppo_config.accelerator_kwargs["kwargs_handlers"] = [ + DistributedDataParallelKwargs(find_unused_parameters=training_args.ddp_find_unused_parameters) + ] + if training_args.deepspeed_plugin is not None: + ppo_config.accelerator_kwargs["deepspeed_plugin"] = training_args.deepspeed_plugin + + # Create optimizer and scheduler + if training_args.max_steps > 0: + num_training_steps = training_args.max_steps + else: + total_train_batch_size = backward_batch_size * finetuning_args.ppo_buffer_size * training_args.world_size + num_training_steps = training_args.num_train_epochs * math.ceil(len(dataset) / total_train_batch_size) + + optimizer = self.create_optimizer(model, training_args, finetuning_args) + scheduler = self.create_scheduler(training_args, num_training_steps, optimizer) + + PPOTrainer.__init__( + self, + config=ppo_config, + model=model, + ref_model=ref_model, + tokenizer=tokenizer, + dataset=dataset, + data_collator=data_collator, + lr_scheduler=scheduler, + ) self.args = training_args self.model_args = model_args self.finetuning_args = finetuning_args self.reward_model = reward_model self.current_device = get_current_device() # patch for deepspeed training + self.processor = processor self.generation_config = GenerationConfig( pad_token_id=self.tokenizer.pad_token_id, @@ -59,15 +123,20 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.state = TrainerState() self.control = TrainerControl() - self.is_deepspeed_enabled = self.accelerator.distributed_type == "DEEPSPEED" and hasattr( - self.accelerator.state, "deepspeed_plugin" - ) + self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None + self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None self.log_callback, self.save_callback = callbacks[0], callbacks[1] assert isinstance(self.log_callback, LogCallback) and isinstance(self.save_callback, FixValueHeadModelCallback) if self.args.max_steps > 0: logger.info("max_steps is given, it will override any value given in num_train_epochs") + unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model) + self.is_chatglm_model = getattr(unwrapped_model.config, "model_type", None) == "chatglm" + + device_type = unwrapped_model.pretrained_model.device.type + self.amp_context = torch.autocast(device_type, dtype=model_args.compute_dtype) + if finetuning_args.reward_model_type == "full": if self.is_deepspeed_enabled: if not ( @@ -78,6 +147,11 @@ class CustomPPOTrainer(PPOTrainer, Trainer): else: self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True) + if finetuning_args.use_badam: + from badam import clip_grad_norm_for_sparse_tensor + + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + def ppo_train(self, resume_from_checkpoint: Optional[str] = None) -> None: r""" Implements training loop for the PPO stage, like _inner_training_loop() in Huggingface's Trainer. 
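To make the batch-size and step-count bookkeeping in the constructor above concrete, here is the same arithmetic with hypothetical hyperparameters; the names mirror the arguments used above:

import math

per_device_train_batch_size = 2
gradient_accumulation_steps = 4
ppo_buffer_size = 1
world_size = 2
num_train_epochs = 3
dataset_len = 1000

backward_batch_size = per_device_train_batch_size * gradient_accumulation_steps          # 8
ppo_batch_size = backward_batch_size * ppo_buffer_size                                   # 8 rollouts per PPO step on each device
total_train_batch_size = backward_batch_size * ppo_buffer_size * world_size              # 16 prompts consumed per step across devices
num_training_steps = num_train_epochs * math.ceil(dataset_len / total_train_batch_size)  # 3 * 63 = 189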
@@ -123,7 +197,6 @@ class CustomPPOTrainer(PPOTrainer, Trainer): logger.info(" Total training steps = {}".format(max_steps)) logger.info(" Number of trainable parameters = {}".format(count_parameters(self.model)[0])) - unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model) dataiter = iter(self.dataloader) loss_meter = AverageMeter() reward_meter = AverageMeter() @@ -136,29 +209,21 @@ class CustomPPOTrainer(PPOTrainer, Trainer): dataiter = iter(self.dataloader) batch = next(dataiter) - # Cast to inference mode - unwrapped_model.gradient_checkpointing_disable() - unwrapped_model.config.use_cache = True - self.model.eval() - # Get inputs + self.model.eval() self.tokenizer.padding_side = "right" # change padding side queries, responses, rewards = [], [], [] for idx in range(0, self.config.batch_size, self.config.mini_batch_size): mini_batch_queries, mini_batch_responses = self.get_inputs( batch[idx : idx + self.config.mini_batch_size] ) - mini_batch_rewards = self.get_rewards(mini_batch_queries, mini_batch_responses, unwrapped_model) + mini_batch_rewards = self.get_rewards(mini_batch_queries, mini_batch_responses) queries.extend(mini_batch_queries) responses.extend(mini_batch_responses) rewards.extend(mini_batch_rewards) - # Cast to training mode - unwrapped_model.gradient_checkpointing_enable() - unwrapped_model.config.use_cache = False - self.model.train() - # Run PPO step + self.model.train() stats = self.step(queries, responses, rewards) self.tokenizer.padding_side = "left" # restore padding side loss_meter.update(float(stats["ppo/loss/total"]), n=len(rewards)) @@ -205,26 +270,64 @@ class CustomPPOTrainer(PPOTrainer, Trainer): self.args, self.state, self.control, model=self.accelerator.unwrap_model(self.model) ) + def create_optimizer( + self, + model: "AutoModelForCausalLMWithValueHead", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", + ) -> "torch.optim.Optimizer": + optimizer = create_custom_optimzer(model, training_args, finetuning_args) + if optimizer is None: + decay_params, nodecay_params = [], [] + decay_param_names = self.get_decay_parameter_names(model) + for name, param in model.named_parameters(): + if param.requires_grad: + if name in decay_param_names: + decay_params.append(param) + else: + nodecay_params.append(param) + + optim_class, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args) + param_groups = [ + dict(params=nodecay_params), + dict(params=decay_params, weight_decay=training_args.weight_decay), + ] + optimizer = optim_class(param_groups, **optim_kwargs) + + return optimizer + + def create_scheduler( + self, training_args: "Seq2SeqTrainingArguments", num_training_steps: int, optimizer: "torch.optim.Optimizer" + ) -> "torch.optim.lr_scheduler.LRScheduler": + create_custom_scheduler(training_args, num_training_steps, optimizer) + lr_scheduler = get_scheduler( + training_args.lr_scheduler_type, + optimizer=optimizer, + num_warmup_steps=training_args.get_warmup_steps(num_training_steps), + num_training_steps=num_training_steps, + ) + return lr_scheduler + @torch.no_grad() - def get_inputs(self, batch: Dict[str, torch.Tensor]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + def get_inputs(self, batch: Dict[str, "torch.Tensor"]) -> Tuple[List["torch.Tensor"], List["torch.Tensor"]]: r""" Generates model's responses given queries. 
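The batch-size-1 branch just below strips the left padding before generation; a minimal reproduction of that indexing trick with toy tensors:

import torch

pad_token_id = 0
# a single left-padded query, as produced with tokenizer.padding_side == "left"
batch = {
    "input_ids": torch.tensor([[0, 0, 0, 11, 12, 13]]),
    "attention_mask": torch.tensor([[0, 0, 0, 1, 1, 1]]),
}
if batch["input_ids"].size(0) == 1:
    # index of the first non-pad token; everything before it is padding
    start_index = (batch["input_ids"][0] != pad_token_id).nonzero()[0].item()  # -> 3
    for k, v in batch.items():
        batch[k] = v[:, start_index:]

print(batch["input_ids"])  # tensor([[11, 12, 13]])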
""" - if self.model_args.upcast_layernorm: - layernorm_params = dump_layernorm(self.model) - if batch["input_ids"].size(0) == 1: # handle llama2 ppo with gradient accumulation > 1 start_index = (batch["input_ids"][0] != self.tokenizer.pad_token_id).nonzero()[0].item() for k, v in batch.items(): batch[k] = v[:, start_index:] - unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model) - generate_output: torch.Tensor = unwrapped_model.generate( - generation_config=self.generation_config, logits_processor=get_logits_processor(), **batch - ) + with unwrap_model_for_generation(self.model, self.accelerator) as unwrapped_model: + unwrapped_model = self.accelerator.unwrap_model(self.model) # issue in trl v0.8.6 + if self.model_args.upcast_layernorm: + layernorm_params = dump_layernorm(unwrapped_model) - if self.model_args.upcast_layernorm: - restore_layernorm(self.model, layernorm_params) + generate_output: torch.Tensor = unwrapped_model.generate( + generation_config=self.generation_config, logits_processor=get_logits_processor(), **batch + ) + if self.model_args.upcast_layernorm: + restore_layernorm(unwrapped_model, layernorm_params) query = batch["input_ids"].detach().cpu() response = generate_output[:, batch["input_ids"].size(-1) :].detach().cpu() @@ -246,10 +349,9 @@ class CustomPPOTrainer(PPOTrainer, Trainer): @torch.no_grad() def get_rewards( self, - queries: List[torch.Tensor], - responses: List[torch.Tensor], - unwrapped_model: "AutoModelForCausalLMWithValueHead", - ) -> List[torch.Tensor]: + queries: List["torch.Tensor"], + responses: List["torch.Tensor"], + ) -> List["torch.Tensor"]: r""" Computes scores using given reward model. @@ -260,18 +362,22 @@ class CustomPPOTrainer(PPOTrainer, Trainer): messages = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True) return get_rewards_from_server(self.reward_model, messages) + batch = self.prepare_model_inputs(queries, responses) + unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model) + if self.finetuning_args.reward_model_type == "lora": replace_model(unwrapped_model, target="reward") reward_model = self.model else: reward_model = self.reward_model - batch = self.prepare_model_inputs(queries, responses) + with unwrap_model_for_generation(reward_model, self.accelerator), self.amp_context: # support bf16 + _, _, values = reward_model(**batch, output_hidden_states=True, return_dict=True, use_cache=False) - with torch.cuda.amp.autocast(dtype=self.model_args.compute_dtype): # support bf16 - _, _, values = reward_model(**batch, output_hidden_states=True, return_dict=True) + if self.finetuning_args.reward_model_type == "lora": + replace_model(unwrapped_model, target="default") - if getattr(unwrapped_model.config, "model_type", None) == "chatglm": # assume same architecture + if self.is_chatglm_model: # assume same architecture values = torch.transpose(values, 0, 1) rewards = [] @@ -280,21 +386,18 @@ class CustomPPOTrainer(PPOTrainer, Trainer): end_index = end_indexes[-1].item() if len(end_indexes) else 0 rewards.append(values[i, end_index].float().detach().cpu()) # use fp32 type - if self.finetuning_args.reward_model_type == "lora": - replace_model(unwrapped_model, target="default") - return rewards @PPODecorators.empty_device_cache() def batched_forward_pass( self, model: "AutoModelForCausalLMWithValueHead", - queries: torch.Tensor, - responses: torch.Tensor, - model_inputs: dict, + queries: "torch.Tensor", + responses: "torch.Tensor", + model_inputs: 
Dict[str, Any], return_logits: bool = False, - response_masks: Optional[torch.Tensor] = None, - ): + response_masks: Optional["torch.Tensor"] = None, + ) -> Tuple["torch.Tensor", Optional["torch.Tensor"], "torch.Tensor", "torch.Tensor"]: r""" Calculates model outputs in multiple batches. @@ -316,11 +419,10 @@ class CustomPPOTrainer(PPOTrainer, Trainer): input_ids = input_kwargs["input_ids"] attention_mask = input_kwargs["attention_mask"] - with torch.cuda.amp.autocast(dtype=self.model_args.compute_dtype): # support bf16 + with self.amp_context: # support bf16 logits, _, values = model(**input_kwargs) - unwrapped_model: "AutoModelForCausalLMWithValueHead" = self.accelerator.unwrap_model(self.model) - if getattr(unwrapped_model.config, "model_type", None) == "chatglm": + if self.is_chatglm_model: values = torch.transpose(values, 0, 1) logprobs = logprobs_from_logits(logits[:, :-1, :], input_ids[:, 1:]) @@ -363,14 +465,28 @@ class CustomPPOTrainer(PPOTrainer, Trainer): Subclass and override to inject custom behavior. """ - if self.args.should_save: + if output_dir is None: + output_dir = self.args.output_dir + + if self.is_fsdp_enabled or self.is_deepspeed_enabled: try: - self._save(output_dir, state_dict=self.accelerator.get_state_dict(self.model)) + state_dict = self.accelerator.get_state_dict(self.model) # must be called at all ranks + if self.args.should_save: + self._save(output_dir, state_dict=state_dict) except ValueError: logger.warning( " stage3_gather_16bit_weights_on_model_save=false. Saving the full checkpoint instead," " use zero_to_fp32.py to recover weights" ) - self._save(output_dir, state_dict={}) - remove_dummy_checkpoint(True, output_dir, [WEIGHTS_NAME, SAFE_WEIGHTS_NAME]) + if self.args.should_save: + self._save(output_dir, state_dict={}) + # remove the dummy state_dict + remove_dummy_checkpoint(self.args.should_save, output_dir, [WEIGHTS_NAME, SAFE_WEIGHTS_NAME]) self.model.save_checkpoint(output_dir) + + elif self.args.should_save: + self._save(output_dir) + + if self.processor is not None and self.args.should_save: + output_dir = output_dir if output_dir is not None else self.args.output_dir + getattr(self.processor, "image_processor").save_pretrained(output_dir) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/ppo/workflow.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/ppo/workflow.py similarity index 50% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/ppo/workflow.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/ppo/workflow.py index de9f2a2..111704c 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/ppo/workflow.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/ppo/workflow.py @@ -1,19 +1,15 @@ # Inspired by: https://github.com/lvwerra/trl/blob/main/examples/research_projects/stack_llama/scripts/rl_training.py -import math from typing import TYPE_CHECKING, List, Optional -from torch.optim import AdamW from transformers import DataCollatorWithPadding -from transformers.optimization import get_scheduler -from trl import PPOConfig from ...data import get_dataset from ...extras.callbacks import FixValueHeadModelCallback from ...extras.misc import fix_valuehead_checkpoint from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer -from ..utils import create_custom_optimzer, create_ref_model, create_reward_model +from ..trainer_utils import create_ref_model, create_reward_model from .trainer import CustomPPOTrainer @@ -31,8 +27,9 @@ def run_ppo( 
generating_args: "GeneratingArguments", callbacks: Optional[List["TrainerCallback"]] = None, ): - tokenizer = load_tokenizer(model_args) - dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="ppo") + tokenizer_module = load_tokenizer(model_args) + tokenizer = tokenizer_module["tokenizer"] + dataset = get_dataset(model_args, data_args, training_args, stage="ppo", **tokenizer_module) model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train, add_valuehead=True) tokenizer.padding_side = "left" # use left-padding in generation while using right-padding in training @@ -42,45 +39,6 @@ def run_ppo( ref_model = create_ref_model(model_args, finetuning_args, add_valuehead=True) reward_model = create_reward_model(model, model_args, finetuning_args) - # Create ppo config - backward_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps - ppo_config = PPOConfig( - model_name=model_args.model_name_or_path, - learning_rate=training_args.learning_rate, - mini_batch_size=training_args.per_device_train_batch_size, - batch_size=backward_batch_size * finetuning_args.ppo_buffer_size, - gradient_accumulation_steps=training_args.gradient_accumulation_steps, - ppo_epochs=finetuning_args.ppo_epochs, - max_grad_norm=training_args.max_grad_norm, - seed=training_args.seed, - optimize_device_cache=True, - target=finetuning_args.ppo_target, - log_with=finetuning_args.ppo_logger, - use_score_scaling=finetuning_args.ppo_score_norm, - use_score_norm=finetuning_args.ppo_score_norm, - whiten_rewards=finetuning_args.ppo_whiten_rewards, - accelerator_kwargs={"step_scheduler_with_optimizer": False}, - project_kwargs={"logging_dir": training_args.logging_dir}, - ) - - # Create optimizer and scheduler - optimizer = create_custom_optimzer(model, dataset, training_args, finetuning_args) - if optimizer is None: - optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=training_args.learning_rate) - - if training_args.max_steps > 0: - num_training_steps = training_args.max_steps - else: - total_train_batch_size = backward_batch_size * finetuning_args.ppo_buffer_size * training_args.world_size - num_training_steps = training_args.num_train_epochs * math.ceil(len(dataset) / total_train_batch_size) - - lr_scheduler = get_scheduler( - training_args.lr_scheduler_type, - optimizer=optimizer, - num_warmup_steps=training_args.get_warmup_steps(num_training_steps), - num_training_steps=num_training_steps, - ) - # Initialize our Trainer ppo_trainer = CustomPPOTrainer( model_args=model_args, @@ -88,15 +46,12 @@ def run_ppo( finetuning_args=finetuning_args, generating_args=generating_args, callbacks=callbacks + [FixValueHeadModelCallback()], - reward_model=reward_model, - config=ppo_config, model=model, + reward_model=reward_model, ref_model=ref_model, - tokenizer=tokenizer, dataset=dataset, data_collator=data_collator, - optimizer=optimizer, - lr_scheduler=lr_scheduler, + **tokenizer_module, ) # Training diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/pt/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/pt/__init__.py similarity index 100% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/pt/__init__.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/pt/__init__.py diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/pt/trainer.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/pt/trainer.py new file mode 100644 index 0000000..1d96e82 --- 
/dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/pt/trainer.py @@ -0,0 +1,51 @@ +from types import MethodType +from typing import TYPE_CHECKING, Dict, Optional + +from transformers import Trainer + +from ...extras.logging import get_logger +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler + + +if TYPE_CHECKING: + import torch + from transformers import ProcessorMixin + + from ...hparams import FinetuningArguments + + +logger = get_logger(__name__) + + +class CustomTrainer(Trainer): + r""" + Inherits Trainer for custom optimizer. + """ + + def __init__( + self, finetuning_args: "FinetuningArguments", processor: Optional["ProcessorMixin"], **kwargs + ) -> None: + super().__init__(**kwargs) + self.finetuning_args = finetuning_args + self.processor = processor + if finetuning_args.use_badam: + from badam import clip_grad_norm_for_sparse_tensor + + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + + def create_optimizer(self) -> "torch.optim.Optimizer": + if self.optimizer is None: + self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) + return super().create_optimizer() + + def create_scheduler( + self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None + ) -> "torch.optim.lr_scheduler.LRScheduler": + create_custom_scheduler(self.args, num_training_steps, optimizer) + return super().create_scheduler(num_training_steps, optimizer) + + def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: + super()._save(output_dir, state_dict) + if self.processor is not None: + output_dir = output_dir if output_dir is not None else self.args.output_dir + getattr(self.processor, "image_processor").save_pretrained(output_dir) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/pt/workflow.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/pt/workflow.py similarity index 82% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/pt/workflow.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/pt/workflow.py index 5a08854..8a63556 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/pt/workflow.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/pt/workflow.py @@ -3,12 +3,13 @@ import math from typing import TYPE_CHECKING, List, Optional -from transformers import DataCollatorForLanguageModeling, Trainer +from transformers import DataCollatorForLanguageModeling from ...data import get_dataset, split_dataset from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer -from ..utils import create_custom_optimzer, create_modelcard_and_push +from ..trainer_utils import create_modelcard_and_push +from .trainer import CustomTrainer if TYPE_CHECKING: @@ -24,20 +25,20 @@ def run_pt( finetuning_args: "FinetuningArguments", callbacks: Optional[List["TrainerCallback"]] = None, ): - tokenizer = load_tokenizer(model_args) - dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="pt") + tokenizer_module = load_tokenizer(model_args) + tokenizer = tokenizer_module["tokenizer"] + dataset = get_dataset(model_args, data_args, training_args, stage="pt", **tokenizer_module) model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) # Initialize our Trainer - optimizer = create_custom_optimzer(model, 
dataset, training_args, finetuning_args) - trainer = Trainer( + trainer = CustomTrainer( model=model, args=training_args, - tokenizer=tokenizer, + finetuning_args=finetuning_args, data_collator=data_collator, callbacks=callbacks, - optimizers=(optimizer, None), + **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/rm/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/rm/__init__.py similarity index 100% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/rm/__init__.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/rm/__init__.py diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/rm/metric.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/rm/metric.py similarity index 100% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/rm/metric.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/rm/metric.py diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/rm/trainer.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/rm/trainer.py similarity index 69% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/rm/trainer.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/rm/trainer.py index f7e104c..bfb344d 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/rm/trainer.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/rm/trainer.py @@ -1,29 +1,58 @@ import json import os -from typing import TYPE_CHECKING, Dict, List, Tuple, Union +from types import MethodType +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import torch from transformers import Trainer from ...extras.logging import get_logger +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler if TYPE_CHECKING: - from transformers.modeling_utils import PreTrainedModel + from transformers import PreTrainedModel, ProcessorMixin from transformers.trainer import PredictionOutput + from ...hparams import FinetuningArguments + logger = get_logger(__name__) class PairwiseTrainer(Trainer): r""" - Inherits PeftTrainer to compute pairwise loss. + Inherits Trainer to compute pairwise loss. 
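PairwiseTrainer's compute_loss (its body is truncated by the hunk further down) builds on the standard pairwise ranking objective for reward models. A minimal sketch of that objective only, not of the trainer's exact masking and slicing:

import torch
import torch.nn.functional as F

def pairwise_ranking_loss(chosen_scores: torch.Tensor, rejected_scores: torch.Tensor) -> torch.Tensor:
    # Bradley-Terry style objective: push the chosen reward above the rejected one
    return -F.logsigmoid(chosen_scores - rejected_scores).mean()

chosen = torch.tensor([1.2, 0.3])
rejected = torch.tensor([0.1, 0.5])
print(pairwise_ranking_loss(chosen, rejected))  # scalar loss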
""" - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__( + self, finetuning_args: "FinetuningArguments", processor: Optional["ProcessorMixin"], **kwargs + ) -> None: + super().__init__(**kwargs) + self.finetuning_args = finetuning_args + self.processor = processor self.can_return_loss = True # override property to return eval_loss + if finetuning_args.use_badam: + from badam import clip_grad_norm_for_sparse_tensor + + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + + def create_optimizer(self) -> "torch.optim.Optimizer": + if self.optimizer is None: + self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) + return super().create_optimizer() + + def create_scheduler( + self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None + ) -> "torch.optim.lr_scheduler.LRScheduler": + create_custom_scheduler(self.args, num_training_steps, optimizer) + return super().create_scheduler(num_training_steps, optimizer) + + def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: + super()._save(output_dir, state_dict) + if self.processor is not None: + output_dir = output_dir if output_dir is not None else self.args.output_dir + getattr(self.processor, "image_processor").save_pretrained(output_dir) def compute_loss( self, model: "PreTrainedModel", inputs: Dict[str, torch.Tensor], return_outputs: bool = False @@ -34,7 +63,7 @@ class PairwiseTrainer(Trainer): Subclass and override to inject custom behavior. Note that the first element will be removed from the output tuple. - See: https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/trainer.py#L3509 + See: https://github.com/huggingface/transformers/blob/v4.39.1/src/transformers/trainer.py#L3777 """ # Compute rewards _, _, values = model(**inputs, output_hidden_states=True, return_dict=True) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/rm/workflow.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/rm/workflow.py similarity index 85% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/rm/workflow.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/rm/workflow.py index 9dfef30..2e9e194 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/rm/workflow.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/rm/workflow.py @@ -2,13 +2,12 @@ from typing import TYPE_CHECKING, List, Optional -from ...data import get_dataset, split_dataset +from ...data import PairwiseDataCollatorWithPadding, get_dataset, split_dataset from ...extras.callbacks import FixValueHeadModelCallback from ...extras.misc import fix_valuehead_checkpoint from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer -from ..utils import create_custom_optimzer, create_modelcard_and_push -from .collator import PairwiseDataCollatorWithPadding +from ..trainer_utils import create_modelcard_and_push from .metric import compute_accuracy from .trainer import PairwiseTrainer @@ -26,8 +25,9 @@ def run_rm( finetuning_args: "FinetuningArguments", callbacks: Optional[List["TrainerCallback"]] = None, ): - tokenizer = load_tokenizer(model_args) - dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="rm") + tokenizer_module = load_tokenizer(model_args) + tokenizer = tokenizer_module["tokenizer"] + dataset = get_dataset(model_args, data_args, training_args, stage="rm", 
**tokenizer_module) model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train, add_valuehead=True) data_collator = PairwiseDataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) @@ -35,15 +35,14 @@ def run_rm( training_args.remove_unused_columns = False # important for pairwise dataset # Initialize our Trainer - optimizer = create_custom_optimzer(model, dataset, training_args, finetuning_args) trainer = PairwiseTrainer( model=model, args=training_args, - tokenizer=tokenizer, + finetuning_args=finetuning_args, data_collator=data_collator, callbacks=callbacks + [FixValueHeadModelCallback()], - optimizers=(optimizer, None), compute_metrics=compute_accuracy, + **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) @@ -57,7 +56,7 @@ def run_rm( trainer.save_metrics("train", train_result.metrics) trainer.save_state() if trainer.is_world_process_zero() and finetuning_args.plot_loss: - plot_loss(training_args.output_dir, keys=["loss", "eval_loss"]) + plot_loss(training_args.output_dir, keys=["loss", "eval_loss", "eval_accuracy"]) # Evaluation if training_args.do_eval: diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/sft/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/sft/__init__.py similarity index 100% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/sft/__init__.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/sft/__init__.py diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/sft/metric.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/sft/metric.py similarity index 99% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/sft/metric.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/sft/metric.py index d1af4c1..b135fcf 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/sft/metric.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/sft/metric.py @@ -10,12 +10,15 @@ from ...extras.packages import is_jieba_available, is_nltk_available, is_rouge_a if TYPE_CHECKING: from transformers.tokenization_utils import PreTrainedTokenizer + if is_jieba_available(): import jieba # type: ignore + if is_nltk_available(): from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu + if is_rouge_available(): from rouge_chinese import Rouge diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/sft/trainer.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/sft/trainer.py similarity index 70% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/sft/trainer.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/sft/trainer.py index 36d09f3..c063b21 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/sft/trainer.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/sft/trainer.py @@ -1,31 +1,63 @@ import json import os +from types import MethodType from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import numpy as np import torch -import torch.nn as nn from transformers import Seq2SeqTrainer from ...extras.constants import IGNORE_INDEX from ...extras.logging import get_logger +from ..trainer_utils import create_custom_optimzer, create_custom_scheduler if TYPE_CHECKING: + from transformers import ProcessorMixin from transformers.trainer import PredictionOutput + from ...hparams import FinetuningArguments + logger = get_logger(__name__) class CustomSeq2SeqTrainer(Seq2SeqTrainer): r""" - Inherits PeftTrainer to compute generative metrics 
such as BLEU and ROUGE. + Inherits Seq2SeqTrainer to compute generative metrics such as BLEU and ROUGE. """ + def __init__( + self, finetuning_args: "FinetuningArguments", processor: Optional["ProcessorMixin"], **kwargs + ) -> None: + super().__init__(**kwargs) + self.finetuning_args = finetuning_args + self.processor = processor + if finetuning_args.use_badam: + from badam import clip_grad_norm_for_sparse_tensor + + self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_for_sparse_tensor, self.accelerator) + + def create_optimizer(self) -> "torch.optim.Optimizer": + if self.optimizer is None: + self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args) + return super().create_optimizer() + + def create_scheduler( + self, num_training_steps: int, optimizer: Optional["torch.optim.Optimizer"] = None + ) -> "torch.optim.lr_scheduler.LRScheduler": + create_custom_scheduler(self.args, num_training_steps, optimizer) + return super().create_scheduler(num_training_steps, optimizer) + + def _save(self, output_dir: Optional[str] = None, state_dict: Optional[Dict[str, "torch.Tensor"]] = None) -> None: + super()._save(output_dir, state_dict) + if self.processor is not None: + output_dir = output_dir if output_dir is not None else self.args.output_dir + getattr(self.processor, "image_processor").save_pretrained(output_dir) + def prediction_step( self, - model: nn.Module, + model: "torch.nn.Module", inputs: Dict[str, Union[torch.Tensor, Any]], prediction_loss_only: bool, ignore_keys: Optional[List[str]] = None, diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/sft/workflow.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/sft/workflow.py similarity index 88% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/sft/workflow.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/sft/workflow.py index 099edc1..f09b517 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/sft/workflow.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/sft/workflow.py @@ -9,10 +9,9 @@ from ...extras.constants import IGNORE_INDEX from ...extras.misc import get_logits_processor from ...extras.ploting import plot_loss from ...model import load_model, load_tokenizer -from ...train.sft.metric import ComputeMetrics -from ...train.sft.trainer import CustomSeq2SeqTrainer -from ...train.utils import create_modelcard_and_push -from ..utils import create_custom_optimzer +from ..trainer_utils import create_modelcard_and_push +from .metric import ComputeMetrics +from .trainer import CustomSeq2SeqTrainer if TYPE_CHECKING: @@ -29,8 +28,9 @@ def run_sft( generating_args: "GeneratingArguments", callbacks: Optional[List["TrainerCallback"]] = None, ): - tokenizer = load_tokenizer(model_args) - dataset = get_dataset(tokenizer, model_args, data_args, training_args, stage="sft") + tokenizer_module = load_tokenizer(model_args) + tokenizer = tokenizer_module["tokenizer"] + dataset = get_dataset(model_args, data_args, training_args, stage="sft", **tokenizer_module) model = load_model(tokenizer, model_args, finetuning_args, training_args.do_train) if training_args.predict_with_generate: @@ -48,17 +48,17 @@ def run_sft( # Override the decoding parameters of Seq2SeqTrainer training_args.generation_max_length = training_args.generation_max_length or data_args.cutoff_len training_args.generation_num_beams = data_args.eval_num_beams or training_args.generation_num_beams + training_args.remove_unused_columns = False if model_args.visual_inputs else 
training_args.remove_unused_columns # Initialize our Trainer - optimizer = create_custom_optimzer(model, dataset, training_args, finetuning_args) trainer = CustomSeq2SeqTrainer( model=model, args=training_args, - tokenizer=tokenizer, + finetuning_args=finetuning_args, data_collator=data_collator, callbacks=callbacks, - optimizers=(optimizer, None), compute_metrics=ComputeMetrics(tokenizer) if training_args.predict_with_generate else None, + **tokenizer_module, **split_dataset(dataset, data_args, training_args), ) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/trainer_utils.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/trainer_utils.py new file mode 100644 index 0000000..0ddcdb1 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/trainer_utils.py @@ -0,0 +1,423 @@ +from contextlib import contextmanager +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union + +import torch +from transformers import Trainer +from transformers.optimization import get_scheduler +from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS +from transformers.trainer_pt_utils import get_parameter_names + +from ..extras.constants import IGNORE_INDEX +from ..extras.logging import get_logger +from ..extras.packages import is_galore_available +from ..hparams import FinetuningArguments, ModelArguments +from ..model import find_all_linear_modules, load_model, load_tokenizer, load_valuehead_params + + +if is_galore_available(): + from galore_torch import GaLoreAdafactor, GaLoreAdamW, GaLoreAdamW8bit + + +if TYPE_CHECKING: + from accelerate import Accelerator + from transformers import PreTrainedModel, Seq2SeqTrainingArguments + from trl import AutoModelForCausalLMWithValueHead + + from ..hparams import DataArguments + + +logger = get_logger(__name__) + + +class DummyOptimizer(torch.optim.Optimizer): + r""" + A dummy optimizer used for the GaLore algorithm. + """ + + def __init__( + self, lr: float = 1e-3, optimizer_dict: Optional[Dict["torch.nn.Parameter", "torch.optim.Optimizer"]] = None + ) -> None: + dummy_tensor = torch.randn(1, 1) + self.optimizer_dict = optimizer_dict + super().__init__([dummy_tensor], {"lr": lr}) + + def zero_grad(self, set_to_none: bool = True) -> None: + pass + + def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]: + pass + + +def create_modelcard_and_push( + trainer: "Trainer", + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", +) -> None: + kwargs = { + "tasks": "text-generation", + "finetuned_from": model_args.model_name_or_path, + "tags": ["llama-factory", finetuning_args.finetuning_type], + } + if data_args.dataset is not None: + kwargs["dataset"] = [dataset.strip() for dataset in data_args.dataset.split(",")] + + if model_args.use_unsloth: + kwargs["tags"] = kwargs["tags"] + ["unsloth"] + + if not training_args.do_train: + pass + elif training_args.push_to_hub: + trainer.push_to_hub(**kwargs) + else: + trainer.create_model_card(license="other", **kwargs) # prevent from connecting to hub + + +def create_ref_model( + model_args: "ModelArguments", finetuning_args: "FinetuningArguments", add_valuehead: bool = False +) -> Optional[Union["PreTrainedModel", "AutoModelForCausalLMWithValueHead"]]: + r""" + Creates reference model for PPO/DPO training. Evaluation mode is not supported. + + The valuehead parameter is randomly initialized since it is useless for PPO training. 
+ """ + if finetuning_args.ref_model is not None: + ref_model_args_dict = model_args.to_dict() + ref_model_args_dict.update( + dict( + model_name_or_path=finetuning_args.ref_model, + adapter_name_or_path=finetuning_args.ref_model_adapters, + quantization_bit=finetuning_args.ref_model_quantization_bit, + ) + ) + ref_model_args = ModelArguments(**ref_model_args_dict) + ref_finetuning_args = FinetuningArguments() + tokenizer = load_tokenizer(ref_model_args)["tokenizer"] + ref_model = load_model( + tokenizer, ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead + ) + logger.info("Created reference model from {}".format(finetuning_args.ref_model)) + else: + if finetuning_args.finetuning_type == "lora": + ref_model = None + else: + tokenizer = load_tokenizer(model_args)["tokenizer"] + ref_model = load_model( + tokenizer, model_args, finetuning_args, is_trainable=False, add_valuehead=add_valuehead + ) + logger.info("Created reference model from the model itself.") + + return ref_model + + +def create_reward_model( + model: "AutoModelForCausalLMWithValueHead", model_args: "ModelArguments", finetuning_args: "FinetuningArguments" +) -> Optional["AutoModelForCausalLMWithValueHead"]: + r""" + Creates reward model for PPO training. + """ + if finetuning_args.reward_model_type == "api": + assert finetuning_args.reward_model.startswith("http"), "Please provide full url." + logger.info("Use reward server {}".format(finetuning_args.reward_model)) + return finetuning_args.reward_model + elif finetuning_args.reward_model_type == "lora": + model.pretrained_model.load_adapter(finetuning_args.reward_model, "reward") + for name, param in model.named_parameters(): # https://github.com/huggingface/peft/issues/1090 + if "default" in name: + param.data = param.data.to(torch.float32) # trainable params should in fp32 + vhead_params = load_valuehead_params(finetuning_args.reward_model, model_args) + assert vhead_params is not None, "Reward model is not correctly loaded." + model.register_buffer("reward_head_weight", vhead_params["v_head.summary.weight"], persistent=False) + model.register_buffer("reward_head_bias", vhead_params["v_head.summary.bias"], persistent=False) + model.register_buffer( + "default_head_weight", torch.zeros_like(vhead_params["v_head.summary.weight"]), persistent=False + ) + model.register_buffer( + "default_head_bias", torch.zeros_like(vhead_params["v_head.summary.bias"]), persistent=False + ) + logger.info("Loaded adapter weights of reward model from {}".format(finetuning_args.reward_model)) + return None + else: + reward_model_args_dict = model_args.to_dict() + reward_model_args_dict.update( + dict( + model_name_or_path=finetuning_args.reward_model, + adapter_name_or_path=finetuning_args.reward_model_adapters, + quantization_bit=finetuning_args.reward_model_quantization_bit, + ) + ) + reward_model_args = ModelArguments(**reward_model_args_dict) + reward_finetuning_args = FinetuningArguments() + tokenizer = load_tokenizer(reward_model_args)["tokenizer"] + reward_model = load_model( + tokenizer, reward_model_args, reward_finetuning_args, is_trainable=False, add_valuehead=True + ) + logger.info("Loaded full weights of reward model from {}".format(finetuning_args.reward_model)) + logger.warning("Please ensure the ppo model and reward model share SAME tokenizer and vocabulary.") + return reward_model + + +@contextmanager +def get_ref_context(accelerator: "Accelerator", model: "PreTrainedModel"): + r""" + Gets adapter context for the reference model. 
+ """ + with accelerator.unwrap_model(model).disable_adapter(): + model.eval() + yield + model.train() + + +def _get_decay_parameter_names(model: "PreTrainedModel") -> List[str]: + r""" + Returns a list of names of parameters with weight decay. (weights in non-layernorm layers) + """ + decay_parameters = get_parameter_names(model, ALL_LAYERNORM_LAYERS) + decay_parameters = [name for name in decay_parameters if "bias" not in name] + return decay_parameters + + +def _create_galore_optimizer( + model: "PreTrainedModel", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", +) -> "torch.optim.Optimizer": + if len(finetuning_args.galore_target) == 1 and finetuning_args.galore_target[0] == "all": + galore_targets = find_all_linear_modules(model) + else: + galore_targets = finetuning_args.galore_target + + galore_params: List["torch.nn.Parameter"] = [] + for name, module in model.named_modules(): + if isinstance(module, torch.nn.Linear) and any(target in name for target in galore_targets): + for param in module.parameters(): + if param.requires_grad and len(param.shape) > 1: + galore_params.append(param) + + galore_kwargs = { + "rank": finetuning_args.galore_rank, + "update_proj_gap": finetuning_args.galore_update_interval, + "scale": finetuning_args.galore_scale, + "proj_type": finetuning_args.galore_proj_type, + } + + id_galore_params = {id(param) for param in galore_params} + decay_params, nodecay_params = [], [] # they are non-galore parameters + trainable_params: List["torch.nn.Parameter"] = [] # galore_params + decay_params + nodecay_params + decay_param_names = _get_decay_parameter_names(model) + for name, param in model.named_parameters(): + if param.requires_grad: + trainable_params.append(param) + if id(param) not in id_galore_params: + if name in decay_param_names: + decay_params.append(param) + else: + nodecay_params.append(param) + + _, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args) + + if training_args.optim == "adamw_torch": + optim_class = GaLoreAdamW + elif training_args.optim in ["adamw_bnb_8bit", "adamw_8bit", "paged_adamw_8bit"]: + optim_class = GaLoreAdamW8bit + elif training_args.optim == "adafactor": + optim_class = GaLoreAdafactor + else: + raise NotImplementedError("Unknow optim: {}".format(training_args.optim)) + + if finetuning_args.galore_layerwise: + if training_args.gradient_accumulation_steps != 1: + raise ValueError("Per-layer GaLore does not support gradient accumulation.") + + optimizer_dict: Dict["torch.Tensor", "torch.optim.Optimizer"] = {} + for param in nodecay_params: + param_groups = [dict(params=[param], weight_decay=0.0)] + optimizer_dict[param] = optim_class(param_groups, **optim_kwargs) + for param in decay_params: + param_groups = [dict(params=[param], weight_decay=training_args.weight_decay)] + optimizer_dict[param] = optim_class(param_groups, **optim_kwargs) + for param in galore_params: # galore params have weight decay + param_groups = [dict(params=[param], weight_decay=training_args.weight_decay, **galore_kwargs)] + optimizer_dict[param] = optim_class(param_groups, **optim_kwargs) + + def optimizer_hook(param: "torch.nn.Parameter"): + if param.grad is not None: + optimizer_dict[param].step() + optimizer_dict[param].zero_grad() + + for param in trainable_params: + param.register_post_accumulate_grad_hook(optimizer_hook) + + optimizer = DummyOptimizer(lr=training_args.learning_rate, optimizer_dict=optimizer_dict) + else: + param_groups = [ + dict(params=nodecay_params, weight_decay=0.0), + 
dict(params=decay_params, weight_decay=training_args.weight_decay), + dict(params=galore_params, weight_decay=training_args.weight_decay, **galore_kwargs), + ] + optimizer = optim_class(param_groups, **optim_kwargs) + + logger.info("Using GaLore optimizer, may cause hanging at the start of training, wait patiently.") + return optimizer + + +def _create_loraplus_optimizer( + model: "PreTrainedModel", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", +) -> "torch.optim.Optimizer": + default_lr = training_args.learning_rate + loraplus_lr = training_args.learning_rate * finetuning_args.loraplus_lr_ratio + embedding_lr = finetuning_args.loraplus_lr_embedding + + decay_param_names = _get_decay_parameter_names(model) + param_dict: Dict[str, List["torch.nn.Parameter"]] = { + "lora_a": [], + "lora_b": [], + "lora_b_nodecay": [], + "embedding": [], + } + for name, param in model.named_parameters(): + if param.requires_grad: + if "lora_embedding_B" in name: + param_dict["embedding"].append(param) + elif "lora_B" in name or param.ndim == 1: + if name in decay_param_names: + param_dict["lora_b"].append(param) + else: + param_dict["lora_b_nodecay"].append(param) + else: + param_dict["lora_a"].append(param) + + optim_class, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args) + param_groups = [ + dict(params=param_dict["lora_a"], lr=default_lr, weight_decay=training_args.weight_decay), + dict(params=param_dict["lora_b"], lr=loraplus_lr, weight_decay=training_args.weight_decay), + dict(params=param_dict["lora_b_nodecay"], lr=loraplus_lr, weight_decay=0.0), + dict(params=param_dict["embedding"], lr=embedding_lr, weight_decay=training_args.weight_decay), + ] + optimizer = optim_class(param_groups, **optim_kwargs) + logger.info("Using LoRA+ optimizer with loraplus lr ratio {:.2f}.".format(finetuning_args.loraplus_lr_ratio)) + return optimizer + + +def _create_badam_optimizer( + model: "PreTrainedModel", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", +) -> "torch.optim.Optimizer": + decay_params, nodecay_params = [], [] + decay_param_names = _get_decay_parameter_names(model) + for name, param in model.named_parameters(): + if param.requires_grad: + if name in decay_param_names: + decay_params.append(param) + else: + nodecay_params.append(param) + + optim_class, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(training_args) + param_groups = [ + dict(params=nodecay_params, weight_decay=0.0), + dict(params=decay_params, weight_decay=training_args.weight_decay), + ] + + if finetuning_args.badam_mode == "layer": + from badam import BlockOptimizer + + base_optimizer = optim_class(param_groups, **optim_kwargs) + optimizer = BlockOptimizer( + base_optimizer=base_optimizer, + named_parameters_list=list(model.named_parameters()), + block_prefix_list=None, + switch_block_every=finetuning_args.badam_switch_interval, + start_block=finetuning_args.badam_start_block, + switch_mode=finetuning_args.badam_switch_mode, + verbose=finetuning_args.badam_verbose, + ) + logger.info( + f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, " + f"switch block every {finetuning_args.badam_switch_interval} steps, " + f"default start block is {finetuning_args.badam_start_block}" + ) + + elif finetuning_args.badam_mode == "ratio": + from badam import BlockOptimizerRatio + + assert finetuning_args.badam_update_ratio > 1e-6 + optimizer = BlockOptimizerRatio( + param_groups=param_groups, + 
named_parameters_list=list(model.named_parameters()), + update_ratio=finetuning_args.badam_update_ratio, + mask_mode=finetuning_args.badam_mask_mode, + verbose=finetuning_args.badam_verbose, + include_embedding=False, + **optim_kwargs, + ) + logger.info( + f"Using BAdam optimizer with ratio-wise update, update ratio is {finetuning_args.badam_update_ratio}, " + f"mask mode is {finetuning_args.badam_mask_mode}" + ) + + return optimizer + + +def create_custom_optimzer( + model: "PreTrainedModel", + training_args: "Seq2SeqTrainingArguments", + finetuning_args: "FinetuningArguments", +) -> Optional["torch.optim.Optimizer"]: + if finetuning_args.use_galore: + return _create_galore_optimizer(model, training_args, finetuning_args) + + if finetuning_args.loraplus_lr_ratio is not None: + return _create_loraplus_optimizer(model, training_args, finetuning_args) + + if finetuning_args.use_badam: + return _create_badam_optimizer(model, training_args, finetuning_args) + + +def create_custom_scheduler( + training_args: "Seq2SeqTrainingArguments", + num_training_steps: int, + optimizer: Optional["torch.optim.Optimizer"] = None, +) -> None: + if optimizer is not None and isinstance(optimizer, DummyOptimizer): + optimizer_dict = optimizer.optimizer_dict + scheduler_dict: Dict["torch.nn.Parameter", "torch.optim.lr_scheduler.LRScheduler"] = {} + + for param in optimizer_dict.keys(): + scheduler_dict[param] = get_scheduler( + training_args.lr_scheduler_type, + optimizer=optimizer_dict[param], + num_warmup_steps=training_args.get_warmup_steps(num_training_steps), + num_training_steps=num_training_steps, + scheduler_specific_kwargs=training_args.lr_scheduler_kwargs, + ) + + def scheduler_hook(param: "torch.nn.Parameter"): + scheduler_dict[param].step() + + for param in optimizer_dict.keys(): + param.register_post_accumulate_grad_hook(scheduler_hook) + + +def get_batch_logps( + logits: "torch.Tensor", labels: "torch.Tensor", label_pad_token_id: int = IGNORE_INDEX +) -> Tuple["torch.Tensor", "torch.Tensor"]: + r""" + Computes the log probabilities of the given labels under the given logits. + + Returns: + logps: A tensor of shape (batch_size,) containing the sum of log probabilities. + valid_length: A tensor of shape (batch_size,) containing the number of non-masked tokens. 
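+
+    Example (an illustrative shape check; the vocabulary size 32000 is arbitrary):
+        >>> logits = torch.randn(2, 5, 32000)         # (batch_size, seq_len, vocab_size)
+        >>> labels = torch.randint(0, 32000, (2, 5))  # (batch_size, seq_len)
+        >>> logps, valid_length = get_batch_logps(logits, labels)
+        >>> logps.shape, valid_length.shape
+        (torch.Size([2]), torch.Size([2]))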
+ """ + if logits.shape[:-1] != labels.shape: + raise ValueError("Logits (batchsize x seqlen) and labels must have the same shape.") + + labels = labels[:, 1:].clone() + logits = logits[:, :-1, :] + loss_mask = labels != label_pad_token_id + labels[labels == label_pad_token_id] = 0 # dummy token + per_token_logps = torch.gather(logits.log_softmax(-1), dim=2, index=labels.unsqueeze(2)).squeeze(2) + return (per_token_logps * loss_mask).sum(-1), loss_mask.sum(-1) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/tuner.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/tuner.py similarity index 68% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/train/tuner.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/train/tuner.py index a1b7bec..eed875e 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/tuner.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/train/tuner.py @@ -7,8 +7,9 @@ from ..data import get_template_and_fix_tokenizer from ..extras.callbacks import LogCallback from ..extras.logging import get_logger from ..hparams import get_infer_args, get_train_args -from ..model import load_model_and_tokenizer +from ..model import load_model, load_tokenizer from .dpo import run_dpo +from .kto import run_kto from .ppo import run_ppo from .pt import run_pt from .rm import run_rm @@ -22,9 +23,9 @@ if TYPE_CHECKING: logger = get_logger(__name__) -def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["TrainerCallback"]] = None): +def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: List["TrainerCallback"] = []) -> None: model_args, data_args, training_args, finetuning_args, generating_args = get_train_args(args) - callbacks = [LogCallback()] if callbacks is None else callbacks + callbacks.append(LogCallback(training_args.output_dir)) if finetuning_args.stage == "pt": run_pt(model_args, data_args, training_args, finetuning_args, callbacks) @@ -36,21 +37,26 @@ def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["Tra run_ppo(model_args, data_args, training_args, finetuning_args, generating_args, callbacks) elif finetuning_args.stage == "dpo": run_dpo(model_args, data_args, training_args, finetuning_args, callbacks) + elif finetuning_args.stage == "kto": + run_kto(model_args, data_args, training_args, finetuning_args, callbacks) else: raise ValueError("Unknown task.") -def export_model(args: Optional[Dict[str, Any]] = None): +def export_model(args: Optional[Dict[str, Any]] = None) -> None: model_args, data_args, finetuning_args, _ = get_infer_args(args) if model_args.export_dir is None: - raise ValueError("Please specify `export_dir`.") + raise ValueError("Please specify `export_dir` to save model.") if model_args.adapter_name_or_path is not None and model_args.export_quantization_bit is not None: raise ValueError("Please merge adapters before quantizing the model.") - model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args) + tokenizer_module = load_tokenizer(model_args) + tokenizer = tokenizer_module["tokenizer"] + processor = tokenizer_module["processor"] get_template_and_fix_tokenizer(tokenizer, data_args.template) + model = load_model(tokenizer, model_args, finetuning_args) # must after fixing tokenizer to resize vocab if getattr(model, "quantization_method", None) and model_args.adapter_name_or_path is not None: raise ValueError("Cannot merge adapters to a quantized model.") @@ -58,12 +64,11 @@ def export_model(args: Optional[Dict[str, Any]] = None): if not 
isinstance(model, PreTrainedModel): raise ValueError("The model is not a `PreTrainedModel`, export aborted.") - if getattr(model, "quantization_method", None): - model = model.to("cpu") - elif hasattr(model.config, "torch_dtype"): - model = model.to(getattr(model.config, "torch_dtype")).to("cpu") + if getattr(model, "quantization_method", None) is None: # cannot convert dtype of a quantized model + output_dtype = getattr(model.config, "torch_dtype", torch.float16) + setattr(model.config, "torch_dtype", output_dtype) + model = model.to(output_dtype) else: - model = model.to(torch.float16).to("cpu") setattr(model.config, "torch_dtype", torch.float16) model.save_pretrained( @@ -85,9 +90,13 @@ def export_model(args: Optional[Dict[str, Any]] = None): tokenizer.save_pretrained(model_args.export_dir) if model_args.export_hub_model_id is not None: tokenizer.push_to_hub(model_args.export_hub_model_id, token=model_args.hf_hub_token) + + if model_args.visual_inputs and processor is not None: + getattr(processor, "image_processor").save_pretrained(model_args.export_dir) + if model_args.export_hub_model_id is not None: + getattr(processor, "image_processor").push_to_hub( + model_args.export_hub_model_id, token=model_args.hf_hub_token + ) + except Exception: logger.warning("Cannot save tokenizer, please copy the files manually.") - - -if __name__ == "__main__": - run_exp() diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/chatter.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/chatter.py similarity index 54% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/chatter.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/chatter.py index d149ca2..c82710d 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/chatter.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/chatter.py @@ -1,13 +1,14 @@ import json import os -from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Sequence, Tuple +from typing import TYPE_CHECKING, Dict, Generator, List, Optional, Sequence, Tuple -import gradio as gr -from gradio.components import Component # cannot use TYPE_CHECKING here +from numpy.typing import NDArray from ..chat import ChatModel from ..data import Role +from ..extras.constants import PEFT_METHODS from ..extras.misc import torch_gc +from ..extras.packages import is_gradio_available from .common import get_save_dir from .locales import ALERTS @@ -17,6 +18,10 @@ if TYPE_CHECKING: from .manager import Manager +if is_gradio_available(): + import gradio as gr + + class WebChatModel(ChatModel): def __init__(self, manager: "Manager", demo_mode: bool = False, lazy_init: bool = True) -> None: self.manager = manager @@ -29,21 +34,25 @@ class WebChatModel(ChatModel): if demo_mode and os.environ.get("DEMO_MODEL") and os.environ.get("DEMO_TEMPLATE"): # load demo model model_name_or_path = os.environ.get("DEMO_MODEL") template = os.environ.get("DEMO_TEMPLATE") - super().__init__(dict(model_name_or_path=model_name_or_path, template=template)) + infer_backend = os.environ.get("DEMO_BACKEND", "huggingface") + super().__init__( + dict(model_name_or_path=model_name_or_path, template=template, infer_backend=infer_backend) + ) @property def loaded(self) -> bool: return self.engine is not None - def load_model(self, data: Dict[Component, Any]) -> 
Generator[str, None, None]: - get = lambda name: data[self.manager.get_elem_by_name(name)] - lang = get("top.lang") + def load_model(self, data) -> Generator[str, None, None]: + get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)] + lang, model_name, model_path = get("top.lang"), get("top.model_name"), get("top.model_path") + finetuning_type, checkpoint_path = get("top.finetuning_type"), get("top.checkpoint_path") error = "" if self.loaded: error = ALERTS["err_exists"][lang] - elif not get("top.model_name"): + elif not model_name: error = ALERTS["err_no_model"][lang] - elif not get("top.model_path"): + elif not model_path: error = ALERTS["err_no_path"][lang] elif self.demo_mode: error = ALERTS["err_demo"][lang] @@ -53,34 +62,32 @@ class WebChatModel(ChatModel): yield error return - if get("top.adapter_path"): - adapter_name_or_path = ",".join( - [ - get_save_dir(get("top.model_name"), get("top.finetuning_type"), adapter) - for adapter in get("top.adapter_path") - ] - ) - else: - adapter_name_or_path = None - yield ALERTS["info_loading"][lang] args = dict( - model_name_or_path=get("top.model_path"), - adapter_name_or_path=adapter_name_or_path, - finetuning_type=get("top.finetuning_type"), + model_name_or_path=model_path, + finetuning_type=finetuning_type, quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, template=get("top.template"), - flash_attn=(get("top.booster") == "flash_attn"), + flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto", use_unsloth=(get("top.booster") == "unsloth"), + visual_inputs=get("top.visual_inputs"), rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None, infer_backend=get("infer.infer_backend"), ) - super().__init__(args) + if checkpoint_path: + if finetuning_type in PEFT_METHODS: # list + args["adapter_name_or_path"] = ",".join( + [get_save_dir(model_name, finetuning_type, adapter) for adapter in checkpoint_path] + ) + else: # str + args["model_name_or_path"] = get_save_dir(model_name, finetuning_type, checkpoint_path) + + super().__init__(args) yield ALERTS["info_loaded"][lang] - def unload_model(self, data: Dict[Component, Any]) -> Generator[str, None, None]: - lang = data[self.manager.get_elem_by_name("top.lang")] + def unload_model(self, data) -> Generator[str, None, None]: + lang = data[self.manager.get_elem_by_id("top.lang")] if self.demo_mode: gr.Warning(ALERTS["err_demo"][lang]) @@ -92,23 +99,30 @@ class WebChatModel(ChatModel): torch_gc() yield ALERTS["info_unloaded"][lang] - def predict( + def append( self, - chatbot: List[Tuple[str, str]], + chatbot: List[List[Optional[str]]], + messages: Sequence[Dict[str, str]], role: str, query: str, - messages: Sequence[Tuple[str, str]], + ) -> Tuple[List[List[Optional[str]]], List[Dict[str, str]], str]: + return chatbot + [[query, None]], messages + [{"role": role, "content": query}], "" + + def stream( + self, + chatbot: List[List[Optional[str]]], + messages: Sequence[Dict[str, str]], system: str, tools: str, + image: Optional[NDArray], max_new_tokens: int, top_p: float, temperature: float, - ) -> Generator[Tuple[Sequence[Tuple[str, str]], Sequence[Tuple[str, str]]], None, None]: - chatbot.append([query, ""]) - query_messages = messages + [{"role": role, "content": query}] + ) -> Generator[Tuple[List[List[Optional[str]]], List[Dict[str, str]]], None, None]: + chatbot[-1][1] = "" response = "" for new_text in self.stream_chat( - query_messages, system, tools, max_new_tokens=max_new_tokens, 
top_p=top_p, temperature=temperature + messages, system, tools, image, max_new_tokens=max_new_tokens, top_p=top_p, temperature=temperature ): response += new_text if tools: @@ -120,18 +134,11 @@ class WebChatModel(ChatModel): name, arguments = result arguments = json.loads(arguments) tool_call = json.dumps({"name": name, "arguments": arguments}, ensure_ascii=False) - output_messages = query_messages + [{"role": Role.FUNCTION.value, "content": tool_call}] + output_messages = messages + [{"role": Role.FUNCTION.value, "content": tool_call}] bot_text = "```json\n" + tool_call + "\n```" else: - output_messages = query_messages + [{"role": Role.ASSISTANT.value, "content": result}] + output_messages = messages + [{"role": Role.ASSISTANT.value, "content": result}] bot_text = result - chatbot[-1] = [query, self.postprocess(bot_text)] + chatbot[-1][1] = bot_text yield chatbot, output_messages - - def postprocess(self, response: str) -> str: - blocks = response.split("```") - for i, block in enumerate(blocks): - if i % 2 == 0: - blocks[i] = block.replace("<", "<").replace(">", ">") - return "```".join(blocks) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/common.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/common.py new file mode 100644 index 0000000..37b38df --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/common.py @@ -0,0 +1,176 @@ +import json +import os +from collections import defaultdict +from typing import Any, Dict, Optional, Tuple + +from yaml import safe_dump, safe_load + +from ..extras.constants import ( + CHECKPOINT_NAMES, + DATA_CONFIG, + DEFAULT_TEMPLATE, + PEFT_METHODS, + STAGES_USE_PAIR_DATA, + SUPPORTED_MODELS, + TRAINING_STAGES, + VISION_MODELS, + DownloadSource, +) +from ..extras.logging import get_logger +from ..extras.misc import use_modelscope +from ..extras.packages import is_gradio_available + + +if is_gradio_available(): + import gradio as gr + + +logger = get_logger(__name__) + + +DEFAULT_CACHE_DIR = "cache" +DEFAULT_CONFIG_DIR = "config" +DEFAULT_DATA_DIR = "data" +DEFAULT_SAVE_DIR = "saves" +USER_CONFIG = "user_config.yaml" + + +def get_save_dir(*paths: str) -> os.PathLike: + r""" + Gets the path to saved model checkpoints. + """ + paths = (path.replace(os.path.sep, "").replace(" ", "").strip() for path in paths) + return os.path.join(DEFAULT_SAVE_DIR, *paths) + + +def get_config_path() -> os.PathLike: + r""" + Gets the path to user config. + """ + return os.path.join(DEFAULT_CACHE_DIR, USER_CONFIG) + + +def load_config() -> Dict[str, Any]: + r""" + Loads user config if exists. + """ + try: + with open(get_config_path(), "r", encoding="utf-8") as f: + return safe_load(f) + except Exception: + return {"lang": None, "last_model": None, "path_dict": {}, "cache_dir": None} + + +def save_config(lang: str, model_name: Optional[str] = None, model_path: Optional[str] = None) -> None: + r""" + Saves user config. + """ + os.makedirs(DEFAULT_CACHE_DIR, exist_ok=True) + user_config = load_config() + user_config["lang"] = lang or user_config["lang"] + if model_name: + user_config["last_model"] = model_name + + if model_name and model_path: + user_config["path_dict"][model_name] = model_path + + with open(get_config_path(), "w", encoding="utf-8") as f: + safe_dump(user_config, f) + + +def get_model_path(model_name: str) -> str: + r""" + Gets the model path according to the model name. 
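+
+    Example (illustrative; assumes the given name is neither registered in
+    SUPPORTED_MODELS nor saved in the user config, so the empty default is returned):
+        >>> get_model_path("Custom")  # doctest: +SKIP
+        ''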
+ """ + user_config = load_config() + path_dict: Dict["DownloadSource", str] = SUPPORTED_MODELS.get(model_name, defaultdict(str)) + model_path = user_config["path_dict"].get(model_name, "") or path_dict.get(DownloadSource.DEFAULT, "") + if ( + use_modelscope() + and path_dict.get(DownloadSource.MODELSCOPE) + and model_path == path_dict.get(DownloadSource.DEFAULT) + ): # replace path + model_path = path_dict.get(DownloadSource.MODELSCOPE) + + return model_path + + +def get_prefix(model_name: str) -> str: + r""" + Gets the prefix of the model name to obtain the model family. + """ + return model_name.split("-")[0] + + +def get_model_info(model_name: str) -> Tuple[str, str, bool]: + r""" + Gets the necessary information of this model. + + Returns: + model_path (str) + template (str) + visual (bool) + """ + return get_model_path(model_name), get_template(model_name), get_visual(model_name) + + +def get_template(model_name: str) -> str: + r""" + Gets the template name if the model is a chat model. + """ + if model_name and model_name.endswith("Chat") and get_prefix(model_name) in DEFAULT_TEMPLATE: + return DEFAULT_TEMPLATE[get_prefix(model_name)] + return "default" + + +def get_visual(model_name: str) -> bool: + r""" + Judges if the model is a vision language model. + """ + return get_prefix(model_name) in VISION_MODELS + + +def list_checkpoints(model_name: str, finetuning_type: str) -> "gr.Dropdown": + r""" + Lists all available checkpoints. + """ + checkpoints = [] + if model_name: + save_dir = get_save_dir(model_name, finetuning_type) + if save_dir and os.path.isdir(save_dir): + for checkpoint in os.listdir(save_dir): + if os.path.isdir(os.path.join(save_dir, checkpoint)) and any( + os.path.isfile(os.path.join(save_dir, checkpoint, name)) for name in CHECKPOINT_NAMES + ): + checkpoints.append(checkpoint) + + if finetuning_type in PEFT_METHODS: + return gr.Dropdown(value=[], choices=checkpoints, multiselect=True) + else: + return gr.Dropdown(value=None, choices=checkpoints, multiselect=False) + + +def load_dataset_info(dataset_dir: str) -> Dict[str, Dict[str, Any]]: + r""" + Loads dataset_info.json. + """ + if dataset_dir == "ONLINE": + logger.info("dataset_dir is ONLINE, using online dataset.") + return {} + + try: + with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f: + return json.load(f) + except Exception as err: + logger.warning("Cannot open {} due to {}.".format(os.path.join(dataset_dir, DATA_CONFIG), str(err))) + return {} + + +def list_datasets(dataset_dir: str = None, training_stage: str = list(TRAINING_STAGES.keys())[0]) -> "gr.Dropdown": + r""" + Lists all available datasets in the dataset dir for the training stage. 
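+
+    Example (illustrative; assumes "Supervised Fine-Tuning" is a key of TRAINING_STAGES,
+    mapping to a non-ranking stage, and that the data dir contains dataset_info.json):
+        >>> list_datasets(training_stage="Supervised Fine-Tuning")  # doctest: +SKIP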
+ """ + dataset_info = load_dataset_info(dataset_dir if dataset_dir is not None else DEFAULT_DATA_DIR) + ranking = TRAINING_STAGES[training_stage] in STAGES_USE_PAIR_DATA + datasets = [k for k, v in dataset_info.items() if v.get("ranking", False) == ranking] + return gr.Dropdown(choices=datasets) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/__init__.py similarity index 100% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/__init__.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/__init__.py diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/chatbot.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/chatbot.py new file mode 100644 index 0000000..f83694b --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/chatbot.py @@ -0,0 +1,74 @@ +from typing import TYPE_CHECKING, Dict, Tuple + +from ...data import Role +from ...extras.packages import is_gradio_available +from ..utils import check_json_schema + + +if is_gradio_available(): + import gradio as gr + + +if TYPE_CHECKING: + from gradio.components import Component + + from ..engine import Engine + + +def create_chat_box( + engine: "Engine", visible: bool = False +) -> Tuple["Component", "Component", Dict[str, "Component"]]: + with gr.Column(visible=visible) as chat_box: + chatbot = gr.Chatbot(show_copy_button=True) + messages = gr.State([]) + with gr.Row(): + with gr.Column(scale=4): + with gr.Row(): + with gr.Column(): + role = gr.Dropdown(choices=[Role.USER.value, Role.OBSERVATION.value], value=Role.USER.value) + system = gr.Textbox(show_label=False) + tools = gr.Textbox(show_label=False, lines=3) + + with gr.Column() as image_box: + image = gr.Image(sources=["upload"], type="numpy") + + query = gr.Textbox(show_label=False, lines=8) + submit_btn = gr.Button(variant="primary") + + with gr.Column(scale=1): + max_new_tokens = gr.Slider(minimum=8, maximum=4096, value=512, step=1) + top_p = gr.Slider(minimum=0.01, maximum=1.0, value=0.7, step=0.01) + temperature = gr.Slider(minimum=0.01, maximum=1.5, value=0.95, step=0.01) + clear_btn = gr.Button() + + tools.input(check_json_schema, inputs=[tools, engine.manager.get_elem_by_id("top.lang")]) + + submit_btn.click( + engine.chatter.append, + [chatbot, messages, role, query], + [chatbot, messages, query], + ).then( + engine.chatter.stream, + [chatbot, messages, system, tools, image, max_new_tokens, top_p, temperature], + [chatbot, messages], + ) + clear_btn.click(lambda: ([], []), outputs=[chatbot, messages]) + + return ( + chatbot, + messages, + dict( + chat_box=chat_box, + role=role, + system=system, + tools=tools, + image_box=image_box, + image=image, + query=query, + submit_btn=submit_btn, + max_new_tokens=max_new_tokens, + top_p=top_p, + temperature=temperature, + clear_btn=clear_btn, + ), + ) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/data.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/data.py similarity index 62% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/data.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/data.py index c63b6ea..232b973 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/data.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/data.py @@ -1,10 +1,13 @@ import json import os 
-from typing import TYPE_CHECKING, Any, Dict, Tuple - -import gradio as gr +from typing import TYPE_CHECKING, Any, Dict, List, Tuple from ...extras.constants import DATA_CONFIG +from ...extras.packages import is_gradio_available + + +if is_gradio_available(): + import gradio as gr if TYPE_CHECKING: @@ -22,36 +25,46 @@ def next_page(page_index: int, total_num: int) -> int: return page_index + 1 if (page_index + 1) * PAGE_SIZE < total_num else page_index -def can_preview(dataset_dir: str, dataset: list) -> Dict[str, Any]: +def can_preview(dataset_dir: str, dataset: list) -> "gr.Button": try: with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f: dataset_info = json.load(f) except Exception: - return gr.update(interactive=False) + return gr.Button(interactive=False) - if ( - len(dataset) > 0 - and "file_name" in dataset_info[dataset[0]] - and os.path.isfile(os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"])) - ): - return gr.update(interactive=True) + if len(dataset) == 0 or "file_name" not in dataset_info[dataset[0]]: + return gr.Button(interactive=False) + + data_path = os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"]) + if os.path.isfile(data_path) or (os.path.isdir(data_path) and os.listdir(data_path)): + return gr.Button(interactive=True) else: - return gr.update(interactive=False) + return gr.Button(interactive=False) -def get_preview(dataset_dir: str, dataset: list, page_index: int) -> Tuple[int, list, Dict[str, Any]]: +def _load_data_file(file_path: str) -> List[Any]: + with open(file_path, "r", encoding="utf-8") as f: + if file_path.endswith(".json"): + return json.load(f) + elif file_path.endswith(".jsonl"): + return [json.loads(line) for line in f] + else: + return list(f) + + +def get_preview(dataset_dir: str, dataset: list, page_index: int) -> Tuple[int, list, "gr.Column"]: with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f: dataset_info = json.load(f) - data_file: str = dataset_info[dataset[0]]["file_name"] - with open(os.path.join(dataset_dir, data_file), "r", encoding="utf-8") as f: - if data_file.endswith(".json"): - data = json.load(f) - elif data_file.endswith(".jsonl"): - data = [json.loads(line) for line in f] - else: - data = [line for line in f] # noqa: C416 - return len(data), data[PAGE_SIZE * page_index : PAGE_SIZE * (page_index + 1)], gr.update(visible=True) + data_path = os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"]) + if os.path.isfile(data_path): + data = _load_data_file(data_path) + else: + data = [] + for file_name in os.listdir(data_path): + data.extend(_load_data_file(os.path.join(data_path, file_name))) + + return len(data), data[PAGE_SIZE * page_index : PAGE_SIZE * (page_index + 1)], gr.Column(visible=True) def create_preview_box(dataset_dir: "gr.Textbox", dataset: "gr.Dropdown") -> Dict[str, "Component"]: @@ -67,7 +80,7 @@ def create_preview_box(dataset_dir: "gr.Textbox", dataset: "gr.Dropdown") -> Dic close_btn = gr.Button() with gr.Row(): - preview_samples = gr.JSON(interactive=False) + preview_samples = gr.JSON() dataset.change(can_preview, [dataset_dir, dataset], [data_preview_btn], queue=False).then( lambda: 0, outputs=[page_index], queue=False @@ -81,7 +94,7 @@ def create_preview_box(dataset_dir: "gr.Textbox", dataset: "gr.Dropdown") -> Dic next_btn.click(next_page, [page_index, preview_count], [page_index], queue=False).then( get_preview, [dataset_dir, dataset, page_index], [preview_count, preview_samples, preview_box], queue=False ) - 
close_btn.click(lambda: gr.update(visible=False), outputs=[preview_box], queue=False) + close_btn.click(lambda: gr.Column(visible=False), outputs=[preview_box], queue=False) return dict( data_preview_btn=data_preview_btn, preview_count=preview_count, diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/eval.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/eval.py similarity index 62% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/eval.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/eval.py index 4c35ad8..0a7a0f4 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/eval.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/eval.py @@ -1,11 +1,14 @@ from typing import TYPE_CHECKING, Dict -import gradio as gr - -from ..common import DEFAULT_DATA_DIR, list_dataset +from ...extras.packages import is_gradio_available +from ..common import DEFAULT_DATA_DIR, list_datasets from .data import create_preview_box +if is_gradio_available(): + import gradio as gr + + if TYPE_CHECKING: from gradio.components import Component @@ -18,27 +21,25 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]: with gr.Row(): dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=2) - dataset = gr.Dropdown(multiselect=True, scale=4) + dataset = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=4) preview_elems = create_preview_box(dataset_dir, dataset) - dataset_dir.change(list_dataset, [dataset_dir], [dataset], queue=False) - input_elems.update({dataset_dir, dataset}) elem_dict.update(dict(dataset_dir=dataset_dir, dataset=dataset, **preview_elems)) with gr.Row(): - cutoff_len = gr.Slider(value=1024, minimum=4, maximum=8192, step=1) + cutoff_len = gr.Slider(minimum=4, maximum=65536, value=1024, step=1) max_samples = gr.Textbox(value="100000") - batch_size = gr.Slider(value=8, minimum=1, maximum=512, step=1) + batch_size = gr.Slider(minimum=1, maximum=1024, value=2, step=1) predict = gr.Checkbox(value=True) input_elems.update({cutoff_len, max_samples, batch_size, predict}) elem_dict.update(dict(cutoff_len=cutoff_len, max_samples=max_samples, batch_size=batch_size, predict=predict)) with gr.Row(): - max_new_tokens = gr.Slider(10, 2048, value=128, step=1) - top_p = gr.Slider(0.01, 1, value=0.7, step=0.01) - temperature = gr.Slider(0.01, 1.5, value=0.95, step=0.01) + max_new_tokens = gr.Slider(minimum=8, maximum=4096, value=512, step=1) + top_p = gr.Slider(minimum=0.01, maximum=1, value=0.7, step=0.01) + temperature = gr.Slider(minimum=0.01, maximum=1.5, value=0.95, step=0.01) output_dir = gr.Textbox() input_elems.update({max_new_tokens, top_p, temperature, output_dir}) @@ -46,31 +47,33 @@ def create_eval_tab(engine: "Engine") -> Dict[str, "Component"]: with gr.Row(): cmd_preview_btn = gr.Button() - start_btn = gr.Button() - stop_btn = gr.Button() + start_btn = gr.Button(variant="primary") + stop_btn = gr.Button(variant="stop") with gr.Row(): - resume_btn = gr.Checkbox(visible=False, interactive=False, value=False) - process_bar = gr.Slider(visible=False, interactive=False) + resume_btn = gr.Checkbox(visible=False, interactive=False) + progress_bar = gr.Slider(visible=False, interactive=False) - with gr.Box(): + with gr.Row(): output_box = gr.Markdown() - output_elems = [output_box, process_bar] elem_dict.update( dict( cmd_preview_btn=cmd_preview_btn, start_btn=start_btn, stop_btn=stop_btn, resume_btn=resume_btn, - process_bar=process_bar, + 
progress_bar=progress_bar, output_box=output_box, ) ) + output_elems = [output_box, progress_bar] - cmd_preview_btn.click(engine.runner.preview_eval, input_elems, output_elems) + cmd_preview_btn.click(engine.runner.preview_eval, input_elems, output_elems, concurrency_limit=None) start_btn.click(engine.runner.run_eval, input_elems, output_elems) - stop_btn.click(engine.runner.set_abort, queue=False) - resume_btn.change(engine.runner.monitor, outputs=output_elems) + stop_btn.click(engine.runner.set_abort) + resume_btn.change(engine.runner.monitor, outputs=output_elems, concurrency_limit=None) + + dataset.focus(list_datasets, [dataset_dir], [dataset], queue=False) return elem_dict diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/export.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/export.py similarity index 57% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/export.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/export.py index a40590c..7e1493c 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/export.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/export.py @@ -1,12 +1,17 @@ -from typing import TYPE_CHECKING, Dict, Generator, List +from typing import TYPE_CHECKING, Dict, Generator, List, Union -import gradio as gr - -from ...train import export_model +from ...extras.constants import PEFT_METHODS +from ...extras.misc import torch_gc +from ...extras.packages import is_gradio_available +from ...train.tuner import export_model from ..common import get_save_dir from ..locales import ALERTS +if is_gradio_available(): + import gradio as gr + + if TYPE_CHECKING: from gradio.components import Component @@ -20,12 +25,14 @@ def save_model( lang: str, model_name: str, model_path: str, - adapter_path: List[str], finetuning_type: str, + checkpoint_path: Union[str, List[str]], template: str, - max_shard_size: int, + visual_inputs: bool, + export_size: int, export_quantization_bit: int, export_quantization_dataset: str, + export_device: str, export_legacy_format: bool, export_dir: str, export_hub_model_id: str, @@ -39,44 +46,50 @@ def save_model( error = ALERTS["err_no_export_dir"][lang] elif export_quantization_bit in GPTQ_BITS and not export_quantization_dataset: error = ALERTS["err_no_dataset"][lang] - elif export_quantization_bit not in GPTQ_BITS and not adapter_path: + elif export_quantization_bit not in GPTQ_BITS and not checkpoint_path: error = ALERTS["err_no_adapter"][lang] + elif export_quantization_bit in GPTQ_BITS and isinstance(checkpoint_path, list): + error = ALERTS["err_gptq_lora"][lang] if error: gr.Warning(error) yield error return - if adapter_path: - adapter_name_or_path = ",".join( - [get_save_dir(model_name, finetuning_type, adapter) for adapter in adapter_path] - ) - else: - adapter_name_or_path = None - args = dict( model_name_or_path=model_path, - adapter_name_or_path=adapter_name_or_path, finetuning_type=finetuning_type, template=template, + visual_inputs=visual_inputs, export_dir=export_dir, export_hub_model_id=export_hub_model_id or None, - export_size=max_shard_size, + export_size=export_size, export_quantization_bit=int(export_quantization_bit) if export_quantization_bit in GPTQ_BITS else None, export_quantization_dataset=export_quantization_dataset, + export_device=export_device, export_legacy_format=export_legacy_format, ) + if checkpoint_path: + if finetuning_type in PEFT_METHODS: # list + args["adapter_name_or_path"] = 
",".join( + [get_save_dir(model_name, finetuning_type, adapter) for adapter in checkpoint_path] + ) + else: # str + args["model_name_or_path"] = get_save_dir(model_name, finetuning_type, checkpoint_path) + yield ALERTS["info_exporting"][lang] export_model(args) + torch_gc() yield ALERTS["info_exported"][lang] def create_export_tab(engine: "Engine") -> Dict[str, "Component"]: with gr.Row(): - max_shard_size = gr.Slider(value=1, minimum=1, maximum=100) - export_quantization_bit = gr.Dropdown(choices=["none", "8", "4", "3", "2"], value="none") + export_size = gr.Slider(minimum=1, maximum=100, value=1, step=1) + export_quantization_bit = gr.Dropdown(choices=["none"] + GPTQ_BITS, value="none") export_quantization_dataset = gr.Textbox(value="data/c4_demo.json") + export_device = gr.Radio(choices=["cpu", "auto"], value="cpu") export_legacy_format = gr.Checkbox() with gr.Row(): @@ -89,15 +102,17 @@ def create_export_tab(engine: "Engine") -> Dict[str, "Component"]: export_btn.click( save_model, [ - engine.manager.get_elem_by_name("top.lang"), - engine.manager.get_elem_by_name("top.model_name"), - engine.manager.get_elem_by_name("top.model_path"), - engine.manager.get_elem_by_name("top.adapter_path"), - engine.manager.get_elem_by_name("top.finetuning_type"), - engine.manager.get_elem_by_name("top.template"), - max_shard_size, + engine.manager.get_elem_by_id("top.lang"), + engine.manager.get_elem_by_id("top.model_name"), + engine.manager.get_elem_by_id("top.model_path"), + engine.manager.get_elem_by_id("top.finetuning_type"), + engine.manager.get_elem_by_id("top.checkpoint_path"), + engine.manager.get_elem_by_id("top.template"), + engine.manager.get_elem_by_id("top.visual_inputs"), + export_size, export_quantization_bit, export_quantization_dataset, + export_device, export_legacy_format, export_dir, export_hub_model_id, @@ -106,9 +121,10 @@ def create_export_tab(engine: "Engine") -> Dict[str, "Component"]: ) return dict( - max_shard_size=max_shard_size, + export_size=export_size, export_quantization_bit=export_quantization_bit, export_quantization_dataset=export_quantization_dataset, + export_device=export_device, export_legacy_format=export_legacy_format, export_dir=export_dir, export_hub_model_id=export_hub_model_id, diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/infer.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/infer.py similarity index 57% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/infer.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/infer.py index 135535a..970f462 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/infer.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/infer.py @@ -1,10 +1,13 @@ from typing import TYPE_CHECKING, Dict -import gradio as gr - +from ...extras.packages import is_gradio_available from .chatbot import create_chat_box +if is_gradio_available(): + import gradio as gr + + if TYPE_CHECKING: from gradio.components import Component @@ -25,15 +28,21 @@ def create_infer_tab(engine: "Engine") -> Dict[str, "Component"]: input_elems.update({infer_backend}) elem_dict.update(dict(infer_backend=infer_backend, load_btn=load_btn, unload_btn=unload_btn, info_box=info_box)) - chat_box, chatbot, history, chat_elems = create_chat_box(engine, visible=False) - elem_dict.update(dict(chat_box=chat_box, **chat_elems)) + chatbot, messages, chat_elems = create_chat_box(engine, visible=False) + elem_dict.update(chat_elems) 
load_btn.click(engine.chatter.load_model, input_elems, [info_box]).then( - lambda: gr.update(visible=engine.chatter.loaded), outputs=[chat_box] + lambda: gr.Column(visible=engine.chatter.loaded), outputs=[chat_elems["chat_box"]] ) unload_btn.click(engine.chatter.unload_model, input_elems, [info_box]).then( - lambda: ([], []), outputs=[chatbot, history] - ).then(lambda: gr.update(visible=engine.chatter.loaded), outputs=[chat_box]) + lambda: ([], []), outputs=[chatbot, messages] + ).then(lambda: gr.Column(visible=engine.chatter.loaded), outputs=[chat_elems["chat_box"]]) + + engine.manager.get_elem_by_id("top.visual_inputs").change( + lambda enabled: gr.Column(visible=enabled), + [engine.manager.get_elem_by_id("top.visual_inputs")], + [chat_elems["image_box"]], + ) return elem_dict diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/top.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/top.py new file mode 100644 index 0000000..fd0ead3 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/top.py @@ -0,0 +1,56 @@ +from typing import TYPE_CHECKING, Dict + +from ...data import TEMPLATES +from ...extras.constants import METHODS, SUPPORTED_MODELS +from ...extras.packages import is_gradio_available +from ..common import get_model_info, list_checkpoints, save_config +from ..utils import can_quantize + + +if is_gradio_available(): + import gradio as gr + + +if TYPE_CHECKING: + from gradio.components import Component + + +def create_top() -> Dict[str, "Component"]: + available_models = list(SUPPORTED_MODELS.keys()) + ["Custom"] + + with gr.Row(): + lang = gr.Dropdown(choices=["en", "ru", "zh"], scale=1) + model_name = gr.Dropdown(choices=available_models, scale=3) + model_path = gr.Textbox(scale=3) + + with gr.Row(): + finetuning_type = gr.Dropdown(choices=METHODS, value="lora", scale=1) + checkpoint_path = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=6) + + with gr.Accordion(open=False) as advanced_tab: + with gr.Row(): + quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none", scale=2) + template = gr.Dropdown(choices=list(TEMPLATES.keys()), value="default", scale=2) + rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none", scale=3) + booster = gr.Radio(choices=["none", "flashattn2", "unsloth"], value="none", scale=3) + visual_inputs = gr.Checkbox(scale=1) + + model_name.change(get_model_info, [model_name], [model_path, template, visual_inputs], queue=False) + model_name.input(save_config, inputs=[lang, model_name], queue=False) + model_path.input(save_config, inputs=[lang, model_name, model_path], queue=False) + finetuning_type.change(can_quantize, [finetuning_type], [quantization_bit], queue=False) + checkpoint_path.focus(list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False) + + return dict( + lang=lang, + model_name=model_name, + model_path=model_path, + finetuning_type=finetuning_type, + checkpoint_path=checkpoint_path, + advanced_tab=advanced_tab, + quantization_bit=quantization_bit, + template=template, + rope_scaling=rope_scaling, + booster=booster, + visual_inputs=visual_inputs, + ) diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/train.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/train.py new file mode 100644 index 0000000..72dfc85 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/components/train.py @@ -0,0 +1,331 @@ +from typing import TYPE_CHECKING, 
Dict + +from transformers.trainer_utils import SchedulerType + +from ...extras.constants import TRAINING_STAGES +from ...extras.misc import get_device_count +from ...extras.packages import is_gradio_available +from ..common import DEFAULT_DATA_DIR, list_checkpoints, list_datasets +from ..utils import change_stage, list_config_paths, list_output_dirs +from .data import create_preview_box + + +if is_gradio_available(): + import gradio as gr + + +if TYPE_CHECKING: + from gradio.components import Component + + from ..engine import Engine + + +def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: + input_elems = engine.manager.get_base_elems() + elem_dict = dict() + + with gr.Row(): + training_stage = gr.Dropdown( + choices=list(TRAINING_STAGES.keys()), value=list(TRAINING_STAGES.keys())[0], scale=1 + ) + dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=1) + dataset = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=4) + preview_elems = create_preview_box(dataset_dir, dataset) + + input_elems.update({training_stage, dataset_dir, dataset}) + elem_dict.update(dict(training_stage=training_stage, dataset_dir=dataset_dir, dataset=dataset, **preview_elems)) + + with gr.Row(): + learning_rate = gr.Textbox(value="5e-5") + num_train_epochs = gr.Textbox(value="3.0") + max_grad_norm = gr.Textbox(value="1.0") + max_samples = gr.Textbox(value="100000") + compute_type = gr.Dropdown(choices=["fp16", "bf16", "fp32", "pure_bf16"], value="fp16") + + input_elems.update({learning_rate, num_train_epochs, max_grad_norm, max_samples, compute_type}) + elem_dict.update( + dict( + learning_rate=learning_rate, + num_train_epochs=num_train_epochs, + max_grad_norm=max_grad_norm, + max_samples=max_samples, + compute_type=compute_type, + ) + ) + + with gr.Row(): + cutoff_len = gr.Slider(minimum=4, maximum=65536, value=1024, step=1) + batch_size = gr.Slider(minimum=1, maximum=1024, value=2, step=1) + gradient_accumulation_steps = gr.Slider(minimum=1, maximum=1024, value=8, step=1) + val_size = gr.Slider(minimum=0, maximum=1, value=0, step=0.001) + lr_scheduler_type = gr.Dropdown(choices=[scheduler.value for scheduler in SchedulerType], value="cosine") + + input_elems.update({cutoff_len, batch_size, gradient_accumulation_steps, val_size, lr_scheduler_type}) + elem_dict.update( + dict( + cutoff_len=cutoff_len, + batch_size=batch_size, + gradient_accumulation_steps=gradient_accumulation_steps, + val_size=val_size, + lr_scheduler_type=lr_scheduler_type, + ) + ) + + with gr.Accordion(open=False) as extra_tab: + with gr.Row(): + logging_steps = gr.Slider(minimum=1, maximum=1000, value=5, step=5) + save_steps = gr.Slider(minimum=10, maximum=5000, value=100, step=10) + warmup_steps = gr.Slider(minimum=0, maximum=5000, value=0, step=1) + neftune_alpha = gr.Slider(minimum=0, maximum=10, value=0, step=0.1) + optim = gr.Textbox(value="adamw_torch") + + with gr.Row(): + with gr.Column(): + resize_vocab = gr.Checkbox() + packing = gr.Checkbox() + + with gr.Column(): + upcast_layernorm = gr.Checkbox() + use_llama_pro = gr.Checkbox() + + with gr.Column(): + shift_attn = gr.Checkbox() + report_to = gr.Checkbox() + + input_elems.update( + { + logging_steps, + save_steps, + warmup_steps, + neftune_alpha, + optim, + resize_vocab, + packing, + upcast_layernorm, + use_llama_pro, + shift_attn, + report_to, + } + ) + elem_dict.update( + dict( + extra_tab=extra_tab, + logging_steps=logging_steps, + save_steps=save_steps, + warmup_steps=warmup_steps, + neftune_alpha=neftune_alpha, + optim=optim, + 
resize_vocab=resize_vocab, + packing=packing, + upcast_layernorm=upcast_layernorm, + use_llama_pro=use_llama_pro, + shift_attn=shift_attn, + report_to=report_to, + ) + ) + + with gr.Accordion(open=False) as freeze_tab: + with gr.Row(): + freeze_trainable_layers = gr.Slider(minimum=-128, maximum=128, value=2, step=1) + freeze_trainable_modules = gr.Textbox(value="all") + freeze_extra_modules = gr.Textbox() + + input_elems.update({freeze_trainable_layers, freeze_trainable_modules, freeze_extra_modules}) + elem_dict.update( + dict( + freeze_tab=freeze_tab, + freeze_trainable_layers=freeze_trainable_layers, + freeze_trainable_modules=freeze_trainable_modules, + freeze_extra_modules=freeze_extra_modules, + ) + ) + + with gr.Accordion(open=False) as lora_tab: + with gr.Row(): + lora_rank = gr.Slider(minimum=1, maximum=1024, value=8, step=1) + lora_alpha = gr.Slider(minimum=1, maximum=2048, value=16, step=1) + lora_dropout = gr.Slider(minimum=0, maximum=1, value=0, step=0.01) + loraplus_lr_ratio = gr.Slider(minimum=0, maximum=64, value=0, step=0.01) + create_new_adapter = gr.Checkbox() + + with gr.Row(): + with gr.Column(scale=1): + use_rslora = gr.Checkbox() + use_dora = gr.Checkbox() + + lora_target = gr.Textbox(scale=2) + additional_target = gr.Textbox(scale=2) + + input_elems.update( + { + lora_rank, + lora_alpha, + lora_dropout, + loraplus_lr_ratio, + create_new_adapter, + use_rslora, + use_dora, + lora_target, + additional_target, + } + ) + elem_dict.update( + dict( + lora_tab=lora_tab, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + loraplus_lr_ratio=loraplus_lr_ratio, + create_new_adapter=create_new_adapter, + use_rslora=use_rslora, + use_dora=use_dora, + lora_target=lora_target, + additional_target=additional_target, + ) + ) + + with gr.Accordion(open=False) as rlhf_tab: + with gr.Row(): + pref_beta = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.01) + pref_ftx = gr.Slider(minimum=0, maximum=10, value=0, step=0.01) + pref_loss = gr.Dropdown(choices=["sigmoid", "hinge", "ipo", "kto_pair", "orpo", "simpo"], value="sigmoid") + reward_model = gr.Dropdown(multiselect=True, allow_custom_value=True) + with gr.Column(): + ppo_score_norm = gr.Checkbox() + ppo_whiten_rewards = gr.Checkbox() + + input_elems.update({pref_beta, pref_ftx, pref_loss, reward_model, ppo_score_norm, ppo_whiten_rewards}) + elem_dict.update( + dict( + rlhf_tab=rlhf_tab, + pref_beta=pref_beta, + pref_ftx=pref_ftx, + pref_loss=pref_loss, + reward_model=reward_model, + ppo_score_norm=ppo_score_norm, + ppo_whiten_rewards=ppo_whiten_rewards, + ) + ) + + with gr.Accordion(open=False) as galore_tab: + with gr.Row(): + use_galore = gr.Checkbox() + galore_rank = gr.Slider(minimum=1, maximum=1024, value=16, step=1) + galore_update_interval = gr.Slider(minimum=1, maximum=1024, value=200, step=1) + galore_scale = gr.Slider(minimum=0, maximum=1, value=0.25, step=0.01) + galore_target = gr.Textbox(value="all") + + input_elems.update({use_galore, galore_rank, galore_update_interval, galore_scale, galore_target}) + elem_dict.update( + dict( + galore_tab=galore_tab, + use_galore=use_galore, + galore_rank=galore_rank, + galore_update_interval=galore_update_interval, + galore_scale=galore_scale, + galore_target=galore_target, + ) + ) + + with gr.Accordion(open=False) as badam_tab: + with gr.Row(): + use_badam = gr.Checkbox() + badam_mode = gr.Dropdown(choices=["layer", "ratio"], value="layer") + badam_switch_mode = gr.Dropdown(choices=["ascending", "descending", "random", "fixed"], value="ascending") + 
badam_switch_interval = gr.Slider(minimum=1, maximum=1024, value=50, step=1) + badam_update_ratio = gr.Slider(minimum=0, maximum=1, value=0.05, step=0.01) + + input_elems.update({use_badam, badam_mode, badam_switch_mode, badam_switch_interval, badam_update_ratio}) + elem_dict.update( + dict( + badam_tab=badam_tab, + use_badam=use_badam, + badam_mode=badam_mode, + badam_switch_mode=badam_switch_mode, + badam_switch_interval=badam_switch_interval, + badam_update_ratio=badam_update_ratio, + ) + ) + + with gr.Row(): + cmd_preview_btn = gr.Button() + arg_save_btn = gr.Button() + arg_load_btn = gr.Button() + start_btn = gr.Button(variant="primary") + stop_btn = gr.Button(variant="stop") + + with gr.Row(): + with gr.Column(scale=3): + with gr.Row(): + current_time = gr.Textbox(visible=False, interactive=False) + output_dir = gr.Dropdown(allow_custom_value=True) + config_path = gr.Dropdown(allow_custom_value=True) + + with gr.Row(): + device_count = gr.Textbox(value=str(get_device_count() or 1), interactive=False) + ds_stage = gr.Dropdown(choices=["none", "2", "3"], value="none") + ds_offload = gr.Checkbox() + + with gr.Row(): + resume_btn = gr.Checkbox(visible=False, interactive=False) + progress_bar = gr.Slider(visible=False, interactive=False) + + with gr.Row(): + output_box = gr.Markdown() + + with gr.Column(scale=1): + loss_viewer = gr.Plot() + + input_elems.update({output_dir, config_path, device_count, ds_stage, ds_offload}) + elem_dict.update( + dict( + cmd_preview_btn=cmd_preview_btn, + arg_save_btn=arg_save_btn, + arg_load_btn=arg_load_btn, + start_btn=start_btn, + stop_btn=stop_btn, + current_time=current_time, + output_dir=output_dir, + config_path=config_path, + device_count=device_count, + ds_stage=ds_stage, + ds_offload=ds_offload, + resume_btn=resume_btn, + progress_bar=progress_bar, + output_box=output_box, + loss_viewer=loss_viewer, + ) + ) + output_elems = [output_box, progress_bar, loss_viewer] + + cmd_preview_btn.click(engine.runner.preview_train, input_elems, output_elems, concurrency_limit=None) + start_btn.click(engine.runner.run_train, input_elems, output_elems) + stop_btn.click(engine.runner.set_abort) + resume_btn.change(engine.runner.monitor, outputs=output_elems, concurrency_limit=None) + + lang = engine.manager.get_elem_by_id("top.lang") + model_name: "gr.Dropdown" = engine.manager.get_elem_by_id("top.model_name") + finetuning_type: "gr.Dropdown" = engine.manager.get_elem_by_id("top.finetuning_type") + + arg_save_btn.click(engine.runner.save_args, input_elems, output_elems, concurrency_limit=None) + arg_load_btn.click( + engine.runner.load_args, [lang, config_path], list(input_elems) + [output_box], concurrency_limit=None + ) + + dataset.focus(list_datasets, [dataset_dir, training_stage], [dataset], queue=False) + training_stage.change(change_stage, [training_stage], [dataset, packing], queue=False) + reward_model.focus(list_checkpoints, [model_name, finetuning_type], [reward_model], queue=False) + model_name.change(list_output_dirs, [model_name, finetuning_type, current_time], [output_dir], queue=False) + finetuning_type.change(list_output_dirs, [model_name, finetuning_type, current_time], [output_dir], queue=False) + output_dir.change( + list_output_dirs, [model_name, finetuning_type, current_time], [output_dir], concurrency_limit=None + ) + output_dir.input( + engine.runner.check_output_dir, + [lang, model_name, finetuning_type, output_dir], + list(input_elems) + [output_box], + concurrency_limit=None, + ) + config_path.change(list_config_paths, [current_time], 
[config_path], queue=False) + + return elem_dict diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/css.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/css.py similarity index 100% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/css.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/css.py diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/engine.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/engine.py new file mode 100644 index 0000000..eb6142d --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/engine.py @@ -0,0 +1,67 @@ +from typing import TYPE_CHECKING, Any, Dict + +from .chatter import WebChatModel +from .common import load_config +from .locales import LOCALES +from .manager import Manager +from .runner import Runner +from .utils import create_ds_config, get_time + + +if TYPE_CHECKING: + from gradio.components import Component + + +class Engine: + def __init__(self, demo_mode: bool = False, pure_chat: bool = False) -> None: + self.demo_mode = demo_mode + self.pure_chat = pure_chat + self.manager = Manager() + self.runner = Runner(self.manager, demo_mode) + self.chatter = WebChatModel(self.manager, demo_mode, lazy_init=(not pure_chat)) + if not demo_mode: + create_ds_config() + + def _update_component(self, input_dict: Dict[str, Dict[str, Any]]) -> Dict["Component", "Component"]: + r""" + Gets the dict to update the components. + """ + output_dict: Dict["Component", "Component"] = {} + for elem_id, elem_attr in input_dict.items(): + elem = self.manager.get_elem_by_id(elem_id) + output_dict[elem] = elem.__class__(**elem_attr) + + return output_dict + + def resume(self): + user_config = load_config() if not self.demo_mode else {} + lang = user_config.get("lang", None) or "en" + + init_dict = {"top.lang": {"value": lang}, "infer.chat_box": {"visible": self.chatter.loaded}} + + if not self.pure_chat: + current_time = get_time() + init_dict["train.current_time"] = {"value": current_time} + init_dict["train.output_dir"] = {"value": "train_{}".format(current_time)} + init_dict["train.config_path"] = {"value": "{}.yaml".format(current_time)} + init_dict["eval.output_dir"] = {"value": "eval_{}".format(current_time)} + init_dict["infer.image_box"] = {"visible": False} + + if user_config.get("last_model", None): + init_dict["top.model_name"] = {"value": user_config["last_model"]} + + yield self._update_component(init_dict) + + if self.runner.running and not self.demo_mode and not self.pure_chat: + yield {elem: elem.__class__(value=value) for elem, value in self.runner.running_data.items()} + if self.runner.do_train: + yield self._update_component({"train.resume_btn": {"value": True}}) + else: + yield self._update_component({"eval.resume_btn": {"value": True}}) + + def change_lang(self, lang: str): + return { + elem: elem.__class__(**LOCALES[elem_name][lang]) + for elem_name, elem in self.manager.get_elem_iter() + if elem_name in LOCALES + } diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/interface.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/interface.py new file mode 100644 index 0000000..bae3ba7 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/interface.py @@ -0,0 +1,82 @@ +import os + +from ..extras.packages import is_gradio_available +from .common import save_config +from .components import ( + create_chat_box, + create_eval_tab, + create_export_tab, + create_infer_tab, + create_top, + create_train_tab, +) +from .css import 
CSS +from .engine import Engine + + +if is_gradio_available(): + import gradio as gr + + +def create_ui(demo_mode: bool = False) -> gr.Blocks: + engine = Engine(demo_mode=demo_mode, pure_chat=False) + + with gr.Blocks(title="LLaMA Board", css=CSS) as demo: + if demo_mode: + gr.HTML("
<h1><center>LLaMA Board: A One-stop Web UI for Getting Started with LLaMA Factory</center></h1>")
+            gr.HTML(
+                '<h3><center>Visit <a href="https://github.com/hiyouga/LLaMA-Factory" target="_blank">'
+                "LLaMA Factory</a> for details.</center></h3>
" + ) + gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button") + + engine.manager.add_elems("top", create_top()) + lang: "gr.Dropdown" = engine.manager.get_elem_by_id("top.lang") + + with gr.Tab("Train"): + engine.manager.add_elems("train", create_train_tab(engine)) + + with gr.Tab("Evaluate & Predict"): + engine.manager.add_elems("eval", create_eval_tab(engine)) + + with gr.Tab("Chat"): + engine.manager.add_elems("infer", create_infer_tab(engine)) + + if not demo_mode: + with gr.Tab("Export"): + engine.manager.add_elems("export", create_export_tab(engine)) + + demo.load(engine.resume, outputs=engine.manager.get_elem_list(), concurrency_limit=None) + lang.change(engine.change_lang, [lang], engine.manager.get_elem_list(), queue=False) + lang.input(save_config, inputs=[lang], queue=False) + + return demo + + +def create_web_demo() -> gr.Blocks: + engine = Engine(pure_chat=True) + + with gr.Blocks(title="Web Demo", css=CSS) as demo: + lang = gr.Dropdown(choices=["en", "zh"]) + engine.manager.add_elems("top", dict(lang=lang)) + + _, _, chat_elems = create_chat_box(engine, visible=True) + engine.manager.add_elems("infer", chat_elems) + + demo.load(engine.resume, outputs=engine.manager.get_elem_list(), concurrency_limit=None) + lang.change(engine.change_lang, [lang], engine.manager.get_elem_list(), queue=False) + lang.input(save_config, inputs=[lang], queue=False) + + return demo + + +def run_web_ui() -> None: + gradio_share = os.environ.get("GRADIO_SHARE", "0").lower() in ["true", "1"] + server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0") + create_ui().queue().launch(share=gradio_share, server_name=server_name, inbrowser=True) + + +def run_web_demo() -> None: + gradio_share = os.environ.get("GRADIO_SHARE", "0").lower() in ["true", "1"] + server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0") + create_web_demo().queue().launch(share=gradio_share, server_name=server_name, inbrowser=True) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/locales.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/locales.py similarity index 75% rename from src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/locales.py rename to src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/locales.py index 4f329e8..e30feab 100644 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/locales.py +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/locales.py @@ -46,26 +46,15 @@ LOCALES = { "label": "微调方法", }, }, - "adapter_path": { + "checkpoint_path": { "en": { - "label": "Adapter path", + "label": "Checkpoint path", }, "ru": { - "label": "Путь к адаптеру", + "label": "Путь контрольной точки", }, "zh": { - "label": "适配器路径", - }, - }, - "refresh_btn": { - "en": { - "value": "Refresh adapters", - }, - "ru": { - "value": "Обновить адаптеры", - }, - "zh": { - "value": "刷新适配器", + "label": "检查点路径", }, }, "advanced_tab": { @@ -129,6 +118,17 @@ LOCALES = { "label": "加速方式", }, }, + "visual_inputs": { + "en": { + "label": "Visual inputs", + }, + "ru": { + "label": "визуальные входы", + }, + "zh": { + "label": "图像输入", + }, + }, "training_stage": { "en": { "label": "Stage", @@ -536,6 +536,20 @@ LOCALES = { "info": "使用 LongLoRA 提出的 shift short attention。", }, }, + "report_to": { + "en": { + "label": "Enable external logger", + "info": "Use TensorBoard or wandb to log experiment.", + }, + "ru": { + "label": "Включить внешний регистратор", + "info": "Использовать TensorBoard или wandb для ведения журнала экспериментов.", + }, + "zh": { + "label": 
"启用外部记录面板", + "info": "使用 TensorBoard 或 wandb 记录实验。", + }, + }, "freeze_tab": { "en": { "label": "Freeze tuning configurations", @@ -547,24 +561,24 @@ LOCALES = { "label": "部分参数微调设置", }, }, - "num_layer_trainable": { + "freeze_trainable_layers": { "en": { "label": "Trainable layers", - "info": "The number of trainable layers.", + "info": "Number of the last(+)/first(-) hidden layers to be set as trainable.", }, "ru": { "label": "Обучаемые слои", - "info": "Количество обучаемых слоев.", + "info": "Количество последних (+)/первых (-) скрытых слоев, которые будут установлены как обучаемые.", }, "zh": { "label": "可训练层数", - "info": "可训练模型层的数量。", + "info": "最末尾(+)/最前端(-)可训练隐藏层的数量。", }, }, - "name_module_trainable": { + "freeze_trainable_modules": { "en": { "label": "Trainable modules", - "info": "The name of trainable modules. Use commas to separate multiple modules.", + "info": "Name(s) of trainable modules. Use commas to separate multiple modules.", }, "ru": { "label": "Обучаемые модули", @@ -575,6 +589,26 @@ LOCALES = { "info": "可训练模块的名称。使用英文逗号分隔多个名称。", }, }, + "freeze_extra_modules": { + "en": { + "label": "Extra modules (optional)", + "info": ( + "Name(s) of modules apart from hidden layers to be set as trainable. " + "Use commas to separate multiple modules." + ), + }, + "ru": { + "label": "Дополнительные модули (опционально)", + "info": ( + "Имена модулей, кроме скрытых слоев, которые следует установить в качестве обучаемых. " + "Используйте запятые для разделения нескольких модулей." + ), + }, + "zh": { + "label": "额外模块(非必填)", + "info": "除隐藏层以外的可训练模块名称。使用英文逗号分隔多个名称。", + }, + }, "lora_tab": { "en": { "label": "LoRA configurations", @@ -628,18 +662,32 @@ LOCALES = { "info": "LoRA 权重随机丢弃的概率。", }, }, - "lora_target": { + "loraplus_lr_ratio": { "en": { - "label": "LoRA modules (optional)", - "info": "Name(s) of modules to apply LoRA. Use commas to separate multiple modules.", + "label": "LoRA+ LR ratio", + "info": "The LR ratio of the B matrices in LoRA.", }, "ru": { - "label": "Модули LoRA (опционально)", - "info": "Имена модулей для применения LoRA. Используйте запятые для разделения нескольких модулей.", + "label": "LoRA+ LR коэффициент", + "info": "Коэффициент LR матриц B в LoRA.", }, "zh": { - "label": "LoRA 作用模块(非必填)", - "info": "应用 LoRA 的模块名称。使用英文逗号分隔多个名称。", + "label": "LoRA+ 学习率比例", + "info": "LoRA+ 中 B 矩阵的学习率倍数。", + }, + }, + "create_new_adapter": { + "en": { + "label": "Create new adapter", + "info": "Create a new adapter with randomly initialized weight upon the existing one.", + }, + "ru": { + "label": "Создать новый адаптер", + "info": "Создать новый адаптер с случайной инициализацией веса на основе существующего.", + }, + "zh": { + "label": "新建适配器", + "info": "在现有的适配器上创建一个随机初始化后的新适配器。", }, }, "use_rslora": { @@ -670,18 +718,18 @@ LOCALES = { "info": "使用权重分解的 LoRA。", }, }, - "create_new_adapter": { + "lora_target": { "en": { - "label": "Create new adapter", - "info": "Create a new adapter with randomly initialized weight upon the existing one.", + "label": "LoRA modules (optional)", + "info": "Name(s) of modules to apply LoRA. Use commas to separate multiple modules.", }, "ru": { - "label": "Создать новый адаптер", - "info": "Создать новый адаптер с случайной инициализацией веса на основе существующего.", + "label": "Модули LoRA (опционально)", + "info": "Имена модулей для применения LoRA. 
Используйте запятые для разделения нескольких модулей.", }, "zh": { - "label": "新建适配器", - "info": "在现有的适配器上创建一个随机初始化后的新适配器。", + "label": "LoRA 作用模块(非必填)", + "info": "应用 LoRA 的模块名称。使用英文逗号分隔多个名称。", }, }, "additional_target": { @@ -715,38 +763,52 @@ LOCALES = { "label": "RLHF 参数设置", }, }, - "dpo_beta": { + "pref_beta": { "en": { - "label": "DPO beta", - "info": "Value of the beta parameter in the DPO loss.", + "label": "Beta value", + "info": "Value of the beta parameter in the loss.", }, "ru": { - "label": "DPO бета", - "info": "Значение параметра бета в функции потерь DPO.", + "label": "Бета значение", + "info": "Значение параметра бета в функции потерь.", }, "zh": { - "label": "DPO beta 参数", - "info": "DPO 损失函数中 beta 超参数大小。", + "label": "Beta 参数", + "info": "损失函数中 beta 超参数大小。", }, }, - "dpo_ftx": { + "pref_ftx": { "en": { - "label": "DPO-ftx weight", - "info": "The weight of SFT loss in the DPO-ftx.", + "label": "Ftx gamma", + "info": "The weight of SFT loss in the final loss.", }, "ru": { - "label": "Вес DPO-ftx", - "info": "Вес функции потерь SFT в DPO-ftx.", + "label": "Ftx гамма", + "info": "Вес потери SFT в итоговой потере.", }, "zh": { - "label": "DPO-ftx 权重", - "info": "DPO-ftx 中 SFT 损失的权重大小。", + "label": "Ftx gamma", + "info": "损失函数中 SFT 损失的权重大小。", + }, + }, + "pref_loss": { + "en": { + "label": "Loss type", + "info": "The type of the loss function.", + }, + "ru": { + "label": "Тип потерь", + "info": "Тип функции потерь.", + }, + "zh": { + "label": "损失类型", + "info": "损失函数的类型。", }, }, "reward_model": { "en": { "label": "Reward model", - "info": "Adapter of the reward model for PPO training.", + "info": "Adapter of the reward model in PPO training.", }, "ru": { "label": "Модель вознаграждения", @@ -757,6 +819,34 @@ LOCALES = { "info": "PPO 训练中奖励模型的适配器路径。", }, }, + "ppo_score_norm": { + "en": { + "label": "Score norm", + "info": "Normalizing scores in PPO training.", + }, + "ru": { + "label": "Норма оценок", + "info": "Нормализация оценок в тренировке PPO.", + }, + "zh": { + "label": "奖励模型", + "info": "PPO 训练中归一化奖励分数。", + }, + }, + "ppo_whiten_rewards": { + "en": { + "label": "Whiten rewards", + "info": "Whiten the rewards in PPO training.", + }, + "ru": { + "label": "Белые вознаграждения", + "info": "Осветлите вознаграждения в обучении PPO.", + }, + "zh": { + "label": "白化奖励", + "info": "PPO 训练中将奖励分数做白化处理。", + }, + }, "galore_tab": { "en": { "label": "GaLore configurations", @@ -838,6 +928,87 @@ LOCALES = { "info": "应用 GaLore 的模块名称。使用英文逗号分隔多个名称。", }, }, + "badam_tab": { + "en": { + "label": "BAdam configurations", + }, + "ru": { + "label": "Конфигурации BAdam", + }, + "zh": { + "label": "BAdam 参数设置", + }, + }, + "use_badam": { + "en": { + "label": "Use BAdam", + "info": "Enable the BAdam optimizer.", + }, + "ru": { + "label": "Использовать BAdam", + "info": "Включите оптимизатор BAdam.", + }, + "zh": { + "label": "使用 BAdam", + "info": "使用 BAdam 优化器。", + }, + }, + "badam_mode": { + "en": { + "label": "BAdam mode", + "info": "Whether to use layer-wise or ratio-wise BAdam optimizer.", + }, + "ru": { + "label": "Режим BAdam", + "info": "Использовать ли оптимизатор BAdam с послоевой или пропорциональной настройкой.", + }, + "zh": { + "label": "BAdam 模式", + "info": "使用 layer-wise 或 ratio-wise BAdam 优化器。", + }, + }, + "badam_switch_mode": { + "en": { + "label": "Switch mode", + "info": "The strategy of picking block to update for layer-wise BAdam.", + }, + "ru": { + "label": "Режим переключения", + "info": "Стратегия выбора блока для обновления для послойного BAdam.", + }, + "zh": { + 
"label": "切换策略", + "info": "Layer-wise BAdam 优化器的块切换策略。", + }, + }, + "badam_switch_interval": { + "en": { + "label": "Switch interval", + "info": "Number of steps to update the block for layer-wise BAdam.", + }, + "ru": { + "label": "Интервал переключения", + "info": "количество шагов для обновления блока для пошагового BAdam.", + }, + "zh": { + "label": "切换频率", + "info": "Layer-wise BAdam 优化器的块切换频率。", + }, + }, + "badam_update_ratio": { + "en": { + "label": "Update ratio", + "info": "The ratio of the update for ratio-wise BAdam.", + }, + "ru": { + "label": "Коэффициент обновления", + "info": "Коэффициент обновления для BAdam с учётом соотношений.", + }, + "zh": { + "label": "Block 更新比例", + "info": "Ratio-wise BAdam 优化器的更新比例。", + }, + }, "cmd_preview_btn": { "en": { "value": "Preview command", @@ -849,6 +1020,28 @@ LOCALES = { "value": "预览命令", }, }, + "arg_save_btn": { + "en": { + "value": "Save arguments", + }, + "ru": { + "value": "Сохранить аргументы", + }, + "zh": { + "value": "保存训练参数", + }, + }, + "arg_load_btn": { + "en": { + "value": "Load arguments", + }, + "ru": { + "value": "Загрузить аргументы", + }, + "zh": { + "value": "载入训练参数", + }, + }, "start_btn": { "en": { "value": "Start", @@ -885,6 +1078,62 @@ LOCALES = { "info": "保存结果的路径。", }, }, + "config_path": { + "en": { + "label": "Config path", + "info": "Path to config saving arguments.", + }, + "ru": { + "label": "Путь к конфигурации", + "info": "Путь для сохранения аргументов конфигурации.", + }, + "zh": { + "label": "配置路径", + "info": "保存训练参数的配置文件路径。", + }, + }, + "device_count": { + "en": { + "label": "Device count", + "info": "Number of devices available.", + }, + "ru": { + "label": "Количество устройств", + "info": "Количество доступных устройств.", + }, + "zh": { + "label": "设备数量", + "info": "当前可用的运算设备数。", + }, + }, + "ds_stage": { + "en": { + "label": "DeepSpeed stage", + "info": "DeepSpeed stage for distributed training.", + }, + "ru": { + "label": "Этап DeepSpeed", + "info": "Этап DeepSpeed для распределенного обучения.", + }, + "zh": { + "label": "DeepSpeed stage", + "info": "多卡训练的 DeepSpeed stage。", + }, + }, + "ds_offload": { + "en": { + "label": "Enable offload", + "info": "Enable DeepSpeed offload (slow down training).", + }, + "ru": { + "label": "Включить выгрузку", + "info": "включить выгрузку DeepSpeed (замедлит обучение).", + }, + "zh": { + "label": "使用 offload", + "info": "使用 DeepSpeed offload(会减慢速度)。", + }, + }, "output_box": { "en": { "value": "Ready.", @@ -995,6 +1244,17 @@ LOCALES = { "placeholder": "工具列表(非必填)", }, }, + "image": { + "en": { + "label": "Image (optional)", + }, + "ru": { + "label": "Изображение (по желанию)", + }, + "zh": { + "label": "图像(非必填)", + }, + }, "query": { "en": { "placeholder": "Input...", @@ -1072,7 +1332,7 @@ LOCALES = { "value": "清空历史", }, }, - "max_shard_size": { + "export_size": { "en": { "label": "Max shard size (GB)", "info": "The maximum size for a model file.", @@ -1114,6 +1374,20 @@ LOCALES = { "info": "量化过程中使用的校准数据集。", }, }, + "export_device": { + "en": { + "label": "Export device", + "info": "Which device should be used to export model.", + }, + "ru": { + "label": "Экспорт устройство", + "info": "Какое устройство следует использовать для экспорта модели.", + }, + "zh": { + "label": "导出设备", + "info": "导出模型使用的设备类型。", + }, + }, "export_legacy_format": { "en": { "label": "Export legacy format", @@ -1201,6 +1475,11 @@ ALERTS = { "ru": "Пожалуйста, выберите адаптер.", "zh": "请选择适配器。", }, + "err_no_output_dir": { + "en": "Please provide output dir.", + "ru": "Пожалуйста, 
укажите выходную директорию.", + "zh": "请填写输出目录。", + }, "err_no_reward_model": { "en": "Please select a reward model.", "ru": "Пожалуйста, выберите модель вознаграждения.", @@ -1209,7 +1488,12 @@ ALERTS = { "err_no_export_dir": { "en": "Please provide export dir.", "ru": "Пожалуйста, укажите каталог для экспорта.", - "zh": "请填写导出目录", + "zh": "请填写导出目录。", + }, + "err_gptq_lora": { + "en": "Please merge adapters before quantizing the model.", + "ru": "Пожалуйста, объедините адаптеры перед квантованием модели.", + "zh": "量化模型前请先合并适配器。", }, "err_failed": { "en": "Failed.", @@ -1221,11 +1505,6 @@ ALERTS = { "ru": "Обучение недоступно в демонстрационном режиме, сначала скопируйте пространство в частное.", "zh": "展示模式不支持训练,请先复制到私人空间。", }, - "err_device_count": { - "en": "Multiple GPUs are not supported yet.", - "ru": "Пока не поддерживается множественные GPU.", - "zh": "尚不支持多 GPU 训练。", - }, "err_tool_name": { "en": "Tool name not found.", "ru": "Имя инструмента не найдено.", @@ -1236,15 +1515,25 @@ ALERTS = { "ru": "Неверная схема JSON.", "zh": "Json 格式错误。", }, + "err_config_not_found": { + "en": "Config file is not found.", + "ru": "Файл конфигурации не найден.", + "zh": "未找到配置文件。", + }, "warn_no_cuda": { "en": "CUDA environment was not detected.", "ru": "Среда CUDA не обнаружена.", "zh": "未检测到 CUDA 环境。", }, + "warn_output_dir_exists": { + "en": "Output dir already exists, will resume training from here.", + "ru": "Выходной каталог уже существует, обучение будет продолжено отсюда.", + "zh": "输出目录已存在,将从该断点恢复训练。", + }, "info_aborting": { "en": "Aborted, wait for terminating...", "ru": "Прервано, ожидание завершения...", - "zh": "训练中断,正在等待线程结束……", + "zh": "训练中断,正在等待进程结束……", }, "info_aborted": { "en": "Ready.", @@ -1256,6 +1545,16 @@ ALERTS = { "ru": "Завершено.", "zh": "训练完毕。", }, + "info_config_saved": { + "en": "Arguments have been saved at: ", + "ru": "Аргументы были сохранены по адресу: ", + "zh": "训练参数已保存至:", + }, + "info_config_loaded": { + "en": "Arguments have been restored.", + "ru": "Аргументы были восстановлены.", + "zh": "训练参数已载入。", + }, "info_loading": { "en": "Loading model...", "ru": "Загрузка модели...", diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/manager.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/manager.py new file mode 100644 index 0000000..326fdb8 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/manager.py @@ -0,0 +1,64 @@ +from typing import TYPE_CHECKING, Dict, Generator, List, Set, Tuple + + +if TYPE_CHECKING: + from gradio.components import Component + + +class Manager: + def __init__(self) -> None: + self._id_to_elem: Dict[str, "Component"] = {} + self._elem_to_id: Dict["Component", str] = {} + + def add_elems(self, tab_name: str, elem_dict: Dict[str, "Component"]) -> None: + r""" + Adds elements to manager. + """ + for elem_name, elem in elem_dict.items(): + elem_id = "{}.{}".format(tab_name, elem_name) + self._id_to_elem[elem_id] = elem + self._elem_to_id[elem] = elem_id + + def get_elem_list(self) -> List["Component"]: + r""" + Returns the list of all elements. + """ + return list(self._id_to_elem.values()) + + def get_elem_iter(self) -> Generator[Tuple[str, "Component"], None, None]: + r""" + Returns an iterator over all elements with their names. + """ + for elem_id, elem in self._id_to_elem.items(): + yield elem_id.split(".")[-1], elem + + def get_elem_by_id(self, elem_id: str) -> "Component": + r""" + Gets element by id. 
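The Manager added above is a flat registry that keys every Gradio component as "<tab>.<name>"; that id format is what the new Runner's get lambdas rely on (for example "top.lang", "train.dataset"). A minimal usage sketch, assuming the bundled llamafactory package from this PR is importable, with plain placeholder objects standing in for real Gradio components:

# Usage sketch for the new Manager registry (placeholder objects stand in
# for Gradio components, since Manager only stores references to them).
from llamafactory.webui.manager import Manager

manager = Manager()
lang, model_name = object(), object()
manager.add_elems("top", {"lang": lang, "model_name": model_name})

assert manager.get_elem_by_id("top.lang") is lang              # "<tab>.<name>" lookup
assert manager.get_id_by_elem(model_name) == "top.model_name"  # reverse lookup
assert set(manager.get_elem_list()) == {lang, model_name}      # all registered components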
+ + Example: top.lang, train.dataset + """ + return self._id_to_elem[elem_id] + + def get_id_by_elem(self, elem: "Component") -> str: + r""" + Gets id by element. + """ + return self._elem_to_id[elem] + + def get_base_elems(self) -> Set["Component"]: + r""" + Gets the base elements that are commonly used. + """ + return { + self._id_to_elem["top.lang"], + self._id_to_elem["top.model_name"], + self._id_to_elem["top.model_path"], + self._id_to_elem["top.finetuning_type"], + self._id_to_elem["top.checkpoint_path"], + self._id_to_elem["top.quantization_bit"], + self._id_to_elem["top.template"], + self._id_to_elem["top.rope_scaling"], + self._id_to_elem["top.booster"], + self._id_to_elem["top.visual_inputs"], + } diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/runner.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/runner.py new file mode 100644 index 0000000..3501462 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/runner.py @@ -0,0 +1,406 @@ +import os +from copy import deepcopy +from subprocess import Popen, TimeoutExpired +from typing import TYPE_CHECKING, Any, Dict, Generator, Optional + +from transformers.trainer import TRAINING_ARGS_NAME + +from ..extras.constants import LLAMABOARD_CONFIG, PEFT_METHODS, TRAINING_STAGES +from ..extras.misc import is_gpu_or_npu_available, torch_gc +from ..extras.packages import is_gradio_available +from .common import DEFAULT_CACHE_DIR, DEFAULT_CONFIG_DIR, get_save_dir, load_config +from .locales import ALERTS, LOCALES +from .utils import abort_leaf_process, gen_cmd, get_eval_results, get_trainer_info, load_args, save_args, save_cmd + + +if is_gradio_available(): + import gradio as gr + + +if TYPE_CHECKING: + from gradio.components import Component + + from .manager import Manager + + +class Runner: + def __init__(self, manager: "Manager", demo_mode: bool = False) -> None: + self.manager = manager + self.demo_mode = demo_mode + """ Resume """ + self.trainer: Optional["Popen"] = None + self.do_train = True + self.running_data: Dict["Component", Any] = None + """ State """ + self.aborted = False + self.running = False + + def set_abort(self) -> None: + self.aborted = True + if self.trainer is not None: + abort_leaf_process(self.trainer.pid) + + def _initialize(self, data: Dict["Component", Any], do_train: bool, from_preview: bool) -> str: + get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)] + lang, model_name, model_path = get("top.lang"), get("top.model_name"), get("top.model_path") + dataset = get("train.dataset") if do_train else get("eval.dataset") + + if self.running: + return ALERTS["err_conflict"][lang] + + if not model_name: + return ALERTS["err_no_model"][lang] + + if not model_path: + return ALERTS["err_no_path"][lang] + + if not dataset: + return ALERTS["err_no_dataset"][lang] + + if not from_preview and self.demo_mode: + return ALERTS["err_demo"][lang] + + if do_train: + if not get("train.output_dir"): + return ALERTS["err_no_output_dir"][lang] + + stage = TRAINING_STAGES[get("train.training_stage")] + if stage == "ppo" and not get("train.reward_model"): + return ALERTS["err_no_reward_model"][lang] + else: + if not get("eval.output_dir"): + return ALERTS["err_no_output_dir"][lang] + + if not from_preview and not is_gpu_or_npu_available(): + gr.Warning(ALERTS["warn_no_cuda"][lang]) + + return "" + + def _finalize(self, lang: str, finish_info: str) -> str: + finish_info = ALERTS["info_aborted"][lang] if self.aborted else finish_info + self.trainer = None + self.aborted = False + 
self.running = False + self.running_data = None + torch_gc() + return finish_info + + def _parse_train_args(self, data: Dict["Component", Any]) -> Dict[str, Any]: + get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)] + model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type") + user_config = load_config() + + args = dict( + stage=TRAINING_STAGES[get("train.training_stage")], + do_train=True, + model_name_or_path=get("top.model_path"), + cache_dir=user_config.get("cache_dir", None), + preprocessing_num_workers=16, + finetuning_type=finetuning_type, + quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, + template=get("top.template"), + rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None, + flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto", + use_unsloth=(get("top.booster") == "unsloth"), + visual_inputs=get("top.visual_inputs"), + dataset_dir=get("train.dataset_dir"), + dataset=",".join(get("train.dataset")), + cutoff_len=get("train.cutoff_len"), + learning_rate=float(get("train.learning_rate")), + num_train_epochs=float(get("train.num_train_epochs")), + max_samples=int(get("train.max_samples")), + per_device_train_batch_size=get("train.batch_size"), + gradient_accumulation_steps=get("train.gradient_accumulation_steps"), + lr_scheduler_type=get("train.lr_scheduler_type"), + max_grad_norm=float(get("train.max_grad_norm")), + logging_steps=get("train.logging_steps"), + save_steps=get("train.save_steps"), + warmup_steps=get("train.warmup_steps"), + neftune_noise_alpha=get("train.neftune_alpha") or None, + optim=get("train.optim"), + resize_vocab=get("train.resize_vocab"), + packing=get("train.packing"), + upcast_layernorm=get("train.upcast_layernorm"), + use_llama_pro=get("train.use_llama_pro"), + shift_attn=get("train.shift_attn"), + report_to="all" if get("train.report_to") else "none", + use_galore=get("train.use_galore"), + use_badam=get("train.use_badam"), + output_dir=get_save_dir(model_name, finetuning_type, get("train.output_dir")), + fp16=(get("train.compute_type") == "fp16"), + bf16=(get("train.compute_type") == "bf16"), + pure_bf16=(get("train.compute_type") == "pure_bf16"), + plot_loss=True, + ddp_timeout=180000000, + include_num_input_tokens_seen=True, + ) + + # checkpoints + if get("top.checkpoint_path"): + if finetuning_type in PEFT_METHODS: # list + args["adapter_name_or_path"] = ",".join( + [get_save_dir(model_name, finetuning_type, adapter) for adapter in get("top.checkpoint_path")] + ) + else: # str + args["model_name_or_path"] = get_save_dir(model_name, finetuning_type, get("top.checkpoint_path")) + + # freeze config + if args["finetuning_type"] == "freeze": + args["freeze_trainable_layers"] = get("train.freeze_trainable_layers") + args["freeze_trainable_modules"] = get("train.freeze_trainable_modules") + args["freeze_extra_modules"] = get("train.freeze_extra_modules") or None + + # lora config + if args["finetuning_type"] == "lora": + args["lora_rank"] = get("train.lora_rank") + args["lora_alpha"] = get("train.lora_alpha") + args["lora_dropout"] = get("train.lora_dropout") + args["loraplus_lr_ratio"] = get("train.loraplus_lr_ratio") or None + args["create_new_adapter"] = get("train.create_new_adapter") + args["use_rslora"] = get("train.use_rslora") + args["use_dora"] = get("train.use_dora") + args["lora_target"] = get("train.lora_target") or "all" + args["additional_target"] = get("train.additional_target") or None + + if 
args["use_llama_pro"]: + args["num_layer_trainable"] = get("train.num_layer_trainable") + + # rlhf config + if args["stage"] == "ppo": + if finetuning_type in PEFT_METHODS: + args["reward_model"] = ",".join( + [get_save_dir(model_name, finetuning_type, adapter) for adapter in get("train.reward_model")] + ) + else: + args["reward_model"] = get_save_dir(model_name, finetuning_type, get("train.reward_model")) + + args["reward_model_type"] = "lora" if finetuning_type == "lora" else "full" + args["ppo_score_norm"] = get("train.ppo_score_norm") + args["ppo_whiten_rewards"] = get("train.ppo_whiten_rewards") + args["top_k"] = 0 + args["top_p"] = 0.9 + elif args["stage"] in ["dpo", "kto"]: + args["pref_beta"] = get("train.pref_beta") + args["pref_ftx"] = get("train.pref_ftx") + args["pref_loss"] = get("train.pref_loss") + + # galore config + if args["use_galore"]: + args["galore_rank"] = get("train.galore_rank") + args["galore_update_interval"] = get("train.galore_update_interval") + args["galore_scale"] = get("train.galore_scale") + args["galore_target"] = get("train.galore_target") + + # badam config + if args["use_badam"]: + args["badam_mode"] = get("train.badam_mode") + args["badam_switch_mode"] = get("train.badam_switch_mode") + args["badam_switch_interval"] = get("train.badam_switch_interval") + args["badam_update_ratio"] = get("train.badam_update_ratio") + + # eval config + if get("train.val_size") > 1e-6 and args["stage"] != "ppo": + args["val_size"] = get("train.val_size") + args["eval_strategy"] = "steps" + args["eval_steps"] = args["save_steps"] + args["per_device_eval_batch_size"] = args["per_device_train_batch_size"] + + # ds config + if get("train.ds_stage") != "none": + ds_stage = get("train.ds_stage") + ds_offload = "offload_" if get("train.ds_offload") else "" + args["deepspeed"] = os.path.join(DEFAULT_CACHE_DIR, "ds_z{}_{}config.json".format(ds_stage, ds_offload)) + + return args + + def _parse_eval_args(self, data: Dict["Component", Any]) -> Dict[str, Any]: + get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)] + model_name, finetuning_type = get("top.model_name"), get("top.finetuning_type") + user_config = load_config() + + args = dict( + stage="sft", + model_name_or_path=get("top.model_path"), + cache_dir=user_config.get("cache_dir", None), + preprocessing_num_workers=16, + finetuning_type=finetuning_type, + quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, + template=get("top.template"), + rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None, + flash_attn="fa2" if get("top.booster") == "flashattn2" else "auto", + use_unsloth=(get("top.booster") == "unsloth"), + visual_inputs=get("top.visual_inputs"), + dataset_dir=get("eval.dataset_dir"), + dataset=",".join(get("eval.dataset")), + cutoff_len=get("eval.cutoff_len"), + max_samples=int(get("eval.max_samples")), + per_device_eval_batch_size=get("eval.batch_size"), + predict_with_generate=True, + max_new_tokens=get("eval.max_new_tokens"), + top_p=get("eval.top_p"), + temperature=get("eval.temperature"), + output_dir=get_save_dir(model_name, finetuning_type, get("eval.output_dir")), + ) + + if get("eval.predict"): + args["do_predict"] = True + else: + args["do_eval"] = True + + if get("top.checkpoint_path"): + if finetuning_type in PEFT_METHODS: # list + args["adapter_name_or_path"] = ",".join( + [get_save_dir(model_name, finetuning_type, adapter) for adapter in get("top.checkpoint_path")] + ) + else: # str + 
args["model_name_or_path"] = get_save_dir(model_name, finetuning_type, get("top.checkpoint_path")) + + return args + + def _preview(self, data: Dict["Component", Any], do_train: bool) -> Generator[Dict["Component", str], None, None]: + output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if do_train else "eval")) + error = self._initialize(data, do_train, from_preview=True) + if error: + gr.Warning(error) + yield {output_box: error} + else: + args = self._parse_train_args(data) if do_train else self._parse_eval_args(data) + yield {output_box: gen_cmd(args)} + + def _launch(self, data: Dict["Component", Any], do_train: bool) -> Generator[Dict["Component", Any], None, None]: + output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if do_train else "eval")) + error = self._initialize(data, do_train, from_preview=False) + if error: + gr.Warning(error) + yield {output_box: error} + else: + self.do_train, self.running_data = do_train, data + args = self._parse_train_args(data) if do_train else self._parse_eval_args(data) + + os.makedirs(args["output_dir"], exist_ok=True) + save_args(os.path.join(args["output_dir"], LLAMABOARD_CONFIG), self._form_config_dict(data)) + + env = deepcopy(os.environ) + env["LLAMABOARD_ENABLED"] = "1" + if args.get("deepspeed", None) is not None: + env["FORCE_TORCHRUN"] = "1" + + self.trainer = Popen("llamafactory-cli train {}".format(save_cmd(args)), env=env, shell=True) + yield from self.monitor() + + def _form_config_dict(self, data: Dict["Component", Any]) -> Dict[str, Any]: + config_dict = {} + skip_ids = ["top.lang", "top.model_path", "train.output_dir", "train.config_path", "train.device_count"] + for elem, value in data.items(): + elem_id = self.manager.get_id_by_elem(elem) + if elem_id not in skip_ids: + config_dict[elem_id] = value + + return config_dict + + def preview_train(self, data): + yield from self._preview(data, do_train=True) + + def preview_eval(self, data): + yield from self._preview(data, do_train=False) + + def run_train(self, data): + yield from self._launch(data, do_train=True) + + def run_eval(self, data): + yield from self._launch(data, do_train=False) + + def monitor(self): + self.aborted = False + self.running = True + + get = lambda elem_id: self.running_data[self.manager.get_elem_by_id(elem_id)] + lang, model_name, finetuning_type = get("top.lang"), get("top.model_name"), get("top.finetuning_type") + output_dir = get("{}.output_dir".format("train" if self.do_train else "eval")) + output_path = get_save_dir(model_name, finetuning_type, output_dir) + + output_box = self.manager.get_elem_by_id("{}.output_box".format("train" if self.do_train else "eval")) + progress_bar = self.manager.get_elem_by_id("{}.progress_bar".format("train" if self.do_train else "eval")) + loss_viewer = self.manager.get_elem_by_id("train.loss_viewer") if self.do_train else None + + while self.trainer is not None: + if self.aborted: + yield { + output_box: ALERTS["info_aborting"][lang], + progress_bar: gr.Slider(visible=False), + } + else: + running_log, running_progress, running_loss = get_trainer_info(output_path, self.do_train) + return_dict = { + output_box: running_log, + progress_bar: running_progress, + } + if running_loss is not None: + return_dict[loss_viewer] = running_loss + + yield return_dict + + try: + self.trainer.wait(2) + self.trainer = None + except TimeoutExpired: + continue + + if self.do_train: + if os.path.exists(os.path.join(output_path, TRAINING_ARGS_NAME)): + finish_info = ALERTS["info_finished"][lang] + 
else: + finish_info = ALERTS["err_failed"][lang] + else: + if os.path.exists(os.path.join(output_path, "all_results.json")): + finish_info = get_eval_results(os.path.join(output_path, "all_results.json")) + else: + finish_info = ALERTS["err_failed"][lang] + + return_dict = { + output_box: self._finalize(lang, finish_info), + progress_bar: gr.Slider(visible=False), + } + yield return_dict + + def save_args(self, data): + output_box = self.manager.get_elem_by_id("train.output_box") + error = self._initialize(data, do_train=True, from_preview=True) + if error: + gr.Warning(error) + return {output_box: error} + + lang = data[self.manager.get_elem_by_id("top.lang")] + config_path = data[self.manager.get_elem_by_id("train.config_path")] + os.makedirs(DEFAULT_CONFIG_DIR, exist_ok=True) + save_path = os.path.join(DEFAULT_CONFIG_DIR, config_path) + + save_args(save_path, self._form_config_dict(data)) + return {output_box: ALERTS["info_config_saved"][lang] + save_path} + + def load_args(self, lang: str, config_path: str): + output_box = self.manager.get_elem_by_id("train.output_box") + config_dict = load_args(os.path.join(DEFAULT_CONFIG_DIR, config_path)) + if config_dict is None: + gr.Warning(ALERTS["err_config_not_found"][lang]) + return {output_box: ALERTS["err_config_not_found"][lang]} + + output_dict: Dict["Component", Any] = {output_box: ALERTS["info_config_loaded"][lang]} + for elem_id, value in config_dict.items(): + output_dict[self.manager.get_elem_by_id(elem_id)] = value + + return output_dict + + def check_output_dir(self, lang: str, model_name: str, finetuning_type: str, output_dir: str): + output_box = self.manager.get_elem_by_id("train.output_box") + output_dict: Dict["Component", Any] = {output_box: LOCALES["output_box"][lang]["value"]} + if model_name and output_dir and os.path.isdir(get_save_dir(model_name, finetuning_type, output_dir)): + gr.Warning(ALERTS["warn_output_dir_exists"][lang]) + output_dict[output_box] = ALERTS["warn_output_dir_exists"][lang] + + output_dir = get_save_dir(model_name, finetuning_type, output_dir) + config_dict = load_args(os.path.join(output_dir, LLAMABOARD_CONFIG)) # load llamaboard config + for elem_id, value in config_dict.items(): + output_dict[self.manager.get_elem_by_id(elem_id)] = value + + return output_dict diff --git a/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/utils.py b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/utils.py new file mode 100644 index 0000000..e39f2aa --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/llamafactory/webui/utils.py @@ -0,0 +1,263 @@ +import json +import os +import signal +from datetime import datetime +from typing import Any, Dict, List, Optional, Tuple + +import psutil +from transformers.trainer_utils import get_last_checkpoint +from yaml import safe_dump, safe_load + +from ..extras.constants import PEFT_METHODS, RUNNING_LOG, TRAINER_LOG, TRAINING_ARGS, TRAINING_STAGES +from ..extras.packages import is_gradio_available, is_matplotlib_available +from ..extras.ploting import gen_loss_plot +from .common import DEFAULT_CACHE_DIR, DEFAULT_CONFIG_DIR, get_save_dir +from .locales import ALERTS + + +if is_gradio_available(): + import gradio as gr + + +def abort_leaf_process(pid: int) -> None: + r""" + Aborts the leaf processes. 
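abort_leaf_process above exists because the Runner starts training with Popen("llamafactory-cli train ...", shell=True), so the pid it holds belongs to the shell; the recursion therefore descends to the leaf processes (the actual trainer) and signals only those. A standalone sketch of the same walk, assuming a POSIX host and using only psutil; kill_leaves is an illustrative name, not part of this PR:

# Standalone sketch mirroring the recursive walk: signal only the
# processes at the bottom of the tree, not the intermediate shell.
import os
import signal
import subprocess
import time

import psutil


def kill_leaves(pid: int) -> None:
    children = psutil.Process(pid).children()
    if children:
        for child in children:
            kill_leaves(child.pid)
    else:
        os.kill(pid, signal.SIGABRT)


proc = subprocess.Popen("sleep 30", shell=True)  # shell (or sleep itself) owns proc.pid
time.sleep(0.5)                                  # let the shell spawn its child
kill_leaves(proc.pid)                            # aborts the leaf `sleep` process
proc.wait()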
+ """ + children = psutil.Process(pid).children() + if children: + for child in children: + abort_leaf_process(child.pid) + else: + os.kill(pid, signal.SIGABRT) + + +def can_quantize(finetuning_type: str) -> "gr.Dropdown": + r""" + Judges if the quantization is available in this finetuning type. + """ + if finetuning_type not in PEFT_METHODS: + return gr.Dropdown(value="none", interactive=False) + else: + return gr.Dropdown(interactive=True) + + +def change_stage(training_stage: str = list(TRAINING_STAGES.keys())[0]) -> Tuple[List[str], bool]: + r""" + Modifys states after changing the training stage. + """ + return [], TRAINING_STAGES[training_stage] == "pt" + + +def check_json_schema(text: str, lang: str) -> None: + r""" + Checks if the json schema is valid. + """ + try: + tools = json.loads(text) + if tools: + assert isinstance(tools, list) + for tool in tools: + if "name" not in tool: + raise NotImplementedError("Name not found.") + except NotImplementedError: + gr.Warning(ALERTS["err_tool_name"][lang]) + except Exception: + gr.Warning(ALERTS["err_json_schema"][lang]) + + +def clean_cmd(args: Dict[str, Any]) -> Dict[str, Any]: + r""" + Removes args with NoneType or False or empty string value. + """ + no_skip_keys = ["packing"] + return {k: v for k, v in args.items() if (k in no_skip_keys) or (v is not None and v is not False and v != "")} + + +def gen_cmd(args: Dict[str, Any]) -> str: + r""" + Generates arguments for previewing. + """ + cmd_lines = ["llamafactory-cli train "] + for k, v in clean_cmd(args).items(): + cmd_lines.append(" --{} {} ".format(k, str(v))) + + cmd_text = "\\\n".join(cmd_lines) + cmd_text = "```bash\n{}\n```".format(cmd_text) + return cmd_text + + +def save_cmd(args: Dict[str, Any]) -> str: + r""" + Saves arguments to launch training. + """ + output_dir = args["output_dir"] + os.makedirs(output_dir, exist_ok=True) + + with open(os.path.join(output_dir, TRAINING_ARGS), "w", encoding="utf-8") as f: + safe_dump(clean_cmd(args), f) + + return os.path.join(output_dir, TRAINING_ARGS) + + +def get_eval_results(path: os.PathLike) -> str: + r""" + Gets scores after evaluation. + """ + with open(path, "r", encoding="utf-8") as f: + result = json.dumps(json.load(f), indent=4) + return "```json\n{}\n```\n".format(result) + + +def get_time() -> str: + r""" + Gets current date and time. + """ + return datetime.now().strftime(r"%Y-%m-%d-%H-%M-%S") + + +def get_trainer_info(output_path: os.PathLike, do_train: bool) -> Tuple[str, "gr.Slider", Optional["gr.Plot"]]: + r""" + Gets training infomation for monitor. 
+ """ + running_log = "" + running_progress = gr.Slider(visible=False) + running_loss = None + + running_log_path = os.path.join(output_path, RUNNING_LOG) + if os.path.isfile(running_log_path): + with open(running_log_path, "r", encoding="utf-8") as f: + running_log = f.read() + + trainer_log_path = os.path.join(output_path, TRAINER_LOG) + if os.path.isfile(trainer_log_path): + trainer_log: List[Dict[str, Any]] = [] + with open(trainer_log_path, "r", encoding="utf-8") as f: + for line in f: + trainer_log.append(json.loads(line)) + + if len(trainer_log) != 0: + latest_log = trainer_log[-1] + percentage = latest_log["percentage"] + label = "Running {:d}/{:d}: {} < {}".format( + latest_log["current_steps"], + latest_log["total_steps"], + latest_log["elapsed_time"], + latest_log["remaining_time"], + ) + running_progress = gr.Slider(label=label, value=percentage, visible=True) + + if do_train and is_matplotlib_available(): + running_loss = gr.Plot(gen_loss_plot(trainer_log)) + + return running_log, running_progress, running_loss + + +def load_args(config_path: str) -> Optional[Dict[str, Any]]: + r""" + Loads saved arguments. + """ + try: + with open(config_path, "r", encoding="utf-8") as f: + return safe_load(f) + except Exception: + return None + + +def save_args(config_path: str, config_dict: Dict[str, Any]): + r""" + Saves arguments. + """ + with open(config_path, "w", encoding="utf-8") as f: + safe_dump(config_dict, f) + + +def list_config_paths(current_time: str) -> "gr.Dropdown": + r""" + Lists all the saved configuration files. + """ + config_files = ["{}.yaml".format(current_time)] + if os.path.isdir(DEFAULT_CONFIG_DIR): + for file_name in os.listdir(DEFAULT_CONFIG_DIR): + if file_name.endswith(".yaml") and file_name not in config_files: + config_files.append(file_name) + + return gr.Dropdown(choices=config_files) + + +def list_output_dirs(model_name: Optional[str], finetuning_type: str, current_time: str) -> "gr.Dropdown": + r""" + Lists all the directories that can resume from. + """ + output_dirs = ["train_{}".format(current_time)] + if model_name: + save_dir = get_save_dir(model_name, finetuning_type) + if save_dir and os.path.isdir(save_dir): + for folder in os.listdir(save_dir): + output_dir = os.path.join(save_dir, folder) + if os.path.isdir(output_dir) and get_last_checkpoint(output_dir) is not None: + output_dirs.append(folder) + + return gr.Dropdown(choices=output_dirs) + + +def create_ds_config() -> None: + r""" + Creates deepspeed config. 
+ """ + os.makedirs(DEFAULT_CACHE_DIR, exist_ok=True) + ds_config = { + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "zero_allow_untested_optimizer": True, + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1, + }, + "bf16": {"enabled": "auto"}, + } + offload_config = { + "device": "cpu", + "pin_memory": True, + } + ds_config["zero_optimization"] = { + "stage": 2, + "allgather_partitions": True, + "allgather_bucket_size": 5e8, + "overlap_comm": True, + "reduce_scatter": True, + "reduce_bucket_size": 5e8, + "contiguous_gradients": True, + "round_robin_gradients": True, + } + with open(os.path.join(DEFAULT_CACHE_DIR, "ds_z2_config.json"), "w", encoding="utf-8") as f: + json.dump(ds_config, f, indent=2) + + ds_config["zero_optimization"]["offload_optimizer"] = offload_config + with open(os.path.join(DEFAULT_CACHE_DIR, "ds_z2_offload_config.json"), "w", encoding="utf-8") as f: + json.dump(ds_config, f, indent=2) + + ds_config["zero_optimization"] = { + "stage": 3, + "overlap_comm": True, + "contiguous_gradients": True, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": True, + } + with open(os.path.join(DEFAULT_CACHE_DIR, "ds_z3_config.json"), "w", encoding="utf-8") as f: + json.dump(ds_config, f, indent=2) + + ds_config["zero_optimization"]["offload_optimizer"] = offload_config + ds_config["zero_optimization"]["offload_param"] = offload_config + with open(os.path.join(DEFAULT_CACHE_DIR, "ds_z3_offload_config.json"), "w", encoding="utf-8") as f: + json.dump(ds_config, f, indent=2) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/__init__.py deleted file mode 100644 index 027f9ed..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# Level: api, webui > chat, eval, train > data, model > extras, hparams - -from .api import create_app -from .chat import ChatModel -from .eval import Evaluator -from .train import export_model, run_exp -from .webui import create_ui, create_web_demo - - -__version__ = "0.5.3" -__all__ = ["create_app", "ChatModel", "Evaluator", "export_model", "run_exp", "create_ui", "create_web_demo"] diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/api/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/api/__init__.py deleted file mode 100644 index d7059fb..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/api/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .app import create_app - - -__all__ = ["create_app"] diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/api/app.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/api/app.py deleted file mode 100644 index c5a18bc..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/api/app.py +++ /dev/null @@ -1,224 +0,0 @@ -import json -import os -from contextlib import asynccontextmanager -from typing import Any, Dict, Sequence - -from pydantic import BaseModel - -from ..chat import ChatModel -from ..data import Role as DataRole -from ..extras.misc import torch_gc -from ..extras.packages import is_fastapi_availble, is_starlette_available, is_uvicorn_available -from .protocol import ( 
- ChatCompletionMessage, - ChatCompletionRequest, - ChatCompletionResponse, - ChatCompletionResponseChoice, - ChatCompletionResponseStreamChoice, - ChatCompletionResponseUsage, - ChatCompletionStreamResponse, - Finish, - Function, - FunctionCall, - ModelCard, - ModelList, - Role, - ScoreEvaluationRequest, - ScoreEvaluationResponse, -) - - -if is_fastapi_availble(): - from fastapi import FastAPI, HTTPException, status - from fastapi.middleware.cors import CORSMiddleware - - -if is_starlette_available(): - from sse_starlette import EventSourceResponse - - -if is_uvicorn_available(): - import uvicorn - - -@asynccontextmanager -async def lifespan(app: "FastAPI"): # collects GPU memory - yield - torch_gc() - - -def dictify(data: "BaseModel") -> Dict[str, Any]: - try: # pydantic v2 - return data.model_dump(exclude_unset=True) - except AttributeError: # pydantic v1 - return data.dict(exclude_unset=True) - - -def jsonify(data: "BaseModel") -> str: - try: # pydantic v2 - return json.dumps(data.model_dump(exclude_unset=True), ensure_ascii=False) - except AttributeError: # pydantic v1 - return data.json(exclude_unset=True, ensure_ascii=False) - - -def create_app(chat_model: "ChatModel") -> "FastAPI": - app = FastAPI(lifespan=lifespan) - - app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], - ) - - role_mapping = { - Role.USER: DataRole.USER.value, - Role.ASSISTANT: DataRole.ASSISTANT.value, - Role.SYSTEM: DataRole.SYSTEM.value, - Role.FUNCTION: DataRole.FUNCTION.value, - Role.TOOL: DataRole.OBSERVATION.value, - } - - @app.get("/v1/models", response_model=ModelList) - async def list_models(): - model_card = ModelCard(id="gpt-3.5-turbo") - return ModelList(data=[model_card]) - - @app.post("/v1/chat/completions", response_model=ChatCompletionResponse, status_code=status.HTTP_200_OK) - async def create_chat_completion(request: ChatCompletionRequest): - if not chat_model.engine.can_generate: - raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed") - - if len(request.messages) == 0: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid length") - - if request.messages[0].role == Role.SYSTEM: - system = request.messages.pop(0).content - else: - system = "" - - if len(request.messages) % 2 == 0: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Only supports u/a/u/a/u...") - - input_messages = [] - for i, message in enumerate(request.messages): - if i % 2 == 0 and message.role not in [Role.USER, Role.TOOL]: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role") - elif i % 2 == 1 and message.role not in [Role.ASSISTANT, Role.FUNCTION]: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid role") - - input_messages.append({"role": role_mapping[message.role], "content": message.content}) - - tool_list = request.tools - if isinstance(tool_list, list) and len(tool_list): - try: - tools = json.dumps([tool["function"] for tool in tool_list], ensure_ascii=False) - except Exception: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid tools") - else: - tools = "" - - if request.stream: - if tools: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Cannot stream function calls.") - - generate = stream_chat_completion(input_messages, system, tools, request) - return EventSourceResponse(generate, media_type="text/event-stream") - - responses = await 
chat_model.achat( - input_messages, - system, - tools, - do_sample=request.do_sample, - temperature=request.temperature, - top_p=request.top_p, - max_new_tokens=request.max_tokens, - num_return_sequences=request.n, - ) - - prompt_length, response_length = 0, 0 - choices = [] - for i, response in enumerate(responses): - if tools: - result = chat_model.engine.template.format_tools.extract(response.response_text) - else: - result = response.response_text - - if isinstance(result, tuple): - name, arguments = result - function = Function(name=name, arguments=arguments) - response_message = ChatCompletionMessage( - role=Role.ASSISTANT, tool_calls=[FunctionCall(function=function)] - ) - finish_reason = Finish.TOOL - else: - response_message = ChatCompletionMessage(role=Role.ASSISTANT, content=result) - finish_reason = Finish.STOP if response.finish_reason == "stop" else Finish.LENGTH - - choices.append( - ChatCompletionResponseChoice(index=i, message=response_message, finish_reason=finish_reason) - ) - prompt_length = response.prompt_length - response_length += response.response_length - - usage = ChatCompletionResponseUsage( - prompt_tokens=prompt_length, - completion_tokens=response_length, - total_tokens=prompt_length + response_length, - ) - - return ChatCompletionResponse(model=request.model, choices=choices, usage=usage) - - async def stream_chat_completion( - messages: Sequence[Dict[str, str]], system: str, tools: str, request: ChatCompletionRequest - ): - choice_data = ChatCompletionResponseStreamChoice( - index=0, delta=ChatCompletionMessage(role=Role.ASSISTANT, content=""), finish_reason=None - ) - chunk = ChatCompletionStreamResponse(model=request.model, choices=[choice_data]) - yield jsonify(chunk) - - async for new_token in chat_model.astream_chat( - messages, - system, - tools, - do_sample=request.do_sample, - temperature=request.temperature, - top_p=request.top_p, - max_new_tokens=request.max_tokens, - ): - if len(new_token) == 0: - continue - - choice_data = ChatCompletionResponseStreamChoice( - index=0, delta=ChatCompletionMessage(content=new_token), finish_reason=None - ) - chunk = ChatCompletionStreamResponse(model=request.model, choices=[choice_data]) - yield jsonify(chunk) - - choice_data = ChatCompletionResponseStreamChoice( - index=0, delta=ChatCompletionMessage(), finish_reason=Finish.STOP - ) - chunk = ChatCompletionStreamResponse(model=request.model, choices=[choice_data]) - yield jsonify(chunk) - yield "[DONE]" - - @app.post("/v1/score/evaluation", response_model=ScoreEvaluationResponse, status_code=status.HTTP_200_OK) - async def create_score_evaluation(request: ScoreEvaluationRequest): - if chat_model.engine.can_generate: - raise HTTPException(status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Not allowed") - - if len(request.messages) == 0: - raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Invalid request") - - scores = await chat_model.aget_scores(request.messages, max_length=request.max_length) - return ScoreEvaluationResponse(model=request.model, scores=scores) - - return app - - -if __name__ == "__main__": - chat_model = ChatModel() - app = create_app(chat_model) - uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("API_PORT", 8000)), workers=1) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/chat/vllm_engine.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/chat/vllm_engine.py deleted file mode 100644 index 9911e36..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/chat/vllm_engine.py +++ /dev/null @@ -1,149 
+0,0 @@ -import uuid -from typing import TYPE_CHECKING, AsyncGenerator, AsyncIterator, Dict, List, Optional, Sequence - -from transformers.utils.versions import require_version - -from ..data import get_template_and_fix_tokenizer -from ..extras.misc import get_device_count -from ..extras.packages import is_vllm_available -from ..model import load_tokenizer -from .base_engine import BaseEngine, Response - - -if is_vllm_available(): - from vllm import AsyncEngineArgs, AsyncLLMEngine, RequestOutput, SamplingParams - -if TYPE_CHECKING: - from ..hparams import DataArguments, FinetuningArguments, GeneratingArguments, ModelArguments - - -class VllmEngine(BaseEngine): - def __init__( - self, - model_args: "ModelArguments", - data_args: "DataArguments", - finetuning_args: "FinetuningArguments", - generating_args: "GeneratingArguments", - ) -> None: - require_version("vllm>=0.3.3", "To fix: pip install vllm>=0.3.3") - self.can_generate = finetuning_args.stage == "sft" - engine_args = AsyncEngineArgs( - model=model_args.model_name_or_path, - trust_remote_code=True, - max_model_len=model_args.vllm_maxlen, - tensor_parallel_size=get_device_count() or 1, - gpu_memory_utilization=model_args.vllm_gpu_util, - disable_log_stats=True, - disable_log_requests=True, - enforce_eager=model_args.vllm_enforce_eager, - ) - self.model = AsyncLLMEngine.from_engine_args(engine_args) - self.tokenizer = load_tokenizer(model_args) - self.tokenizer.padding_side = "left" - self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template) - self.generating_args = generating_args.to_dict() - - async def _generate( - self, - messages: Sequence[Dict[str, str]], - system: Optional[str] = None, - tools: Optional[str] = None, - **input_kwargs, - ) -> AsyncIterator["RequestOutput"]: - request_id = "chatcmpl-{}".format(uuid.uuid4().hex) - paired_messages = messages + [{"role": "assistant", "content": ""}] - prompt_ids, _ = self.template.encode_oneturn( - tokenizer=self.tokenizer, messages=paired_messages, system=system, tools=tools - ) - prompt_length = len(prompt_ids) - - temperature = input_kwargs.pop("temperature", None) - top_p = input_kwargs.pop("top_p", None) - top_k = input_kwargs.pop("top_k", None) - num_return_sequences = input_kwargs.pop("num_return_sequences", None) - repetition_penalty = input_kwargs.pop("repetition_penalty", None) - max_length = input_kwargs.pop("max_length", None) - max_new_tokens = input_kwargs.pop("max_new_tokens", None) - - generating_args = self.generating_args.copy() - generating_args.update( - dict( - temperature=temperature or generating_args["temperature"], - top_p=top_p or generating_args["top_p"], - top_k=top_k or generating_args["top_k"], - num_return_sequences=num_return_sequences or 1, - repetition_penalty=repetition_penalty or generating_args["repetition_penalty"], - ) - ) - - if max_length: - generating_args["max_new_tokens"] = max_length - prompt_length - - if max_new_tokens: - generating_args["max_new_tokens"] = max_new_tokens - - sampling_params = SamplingParams( - n=generating_args["num_return_sequences"], - repetition_penalty=generating_args["repetition_penalty"], - temperature=generating_args["temperature"], - top_p=generating_args["top_p"], - top_k=generating_args["top_k"], - use_beam_search=generating_args["num_beams"] > 1, - length_penalty=generating_args["length_penalty"], - stop_token_ids=[self.tokenizer.eos_token_id] + self.tokenizer.additional_special_tokens_ids, - max_tokens=generating_args["max_new_tokens"], - skip_special_tokens=True, - ) - 
result_generator = self.model.generate( - prompt=None, sampling_params=sampling_params, request_id=request_id, prompt_token_ids=prompt_ids - ) - return result_generator - - async def start(self) -> None: - pass - - async def chat( - self, - messages: Sequence[Dict[str, str]], - system: Optional[str] = None, - tools: Optional[str] = None, - **input_kwargs, - ) -> List["Response"]: - final_output = None - generator = await self._generate(messages, system, tools, **input_kwargs) - async for request_output in generator: - final_output = request_output - - results = [] - for output in final_output.outputs: - results.append( - Response( - response_text=output.text, - response_length=len(output.token_ids), - prompt_length=len(final_output.prompt_token_ids), - finish_reason=output.finish_reason, - ) - ) - - return results - - async def stream_chat( - self, - messages: Sequence[Dict[str, str]], - system: Optional[str] = None, - tools: Optional[str] = None, - **input_kwargs, - ) -> AsyncGenerator[str, None]: - generated_text = "" - generator = await self._generate(messages, system, tools, **input_kwargs) - async for result in generator: - delta_text = result.outputs[0].text[len(generated_text) :] - generated_text = result.outputs[0].text - yield delta_text - - async def get_scores( - self, - batch_input: List[str], - **input_kwargs, - ) -> List[float]: - raise NotImplementedError("vLLM engine does not support get_scores.") diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/__init__.py deleted file mode 100644 index 80dbf5f..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .loader import get_dataset -from .template import Template, get_template_and_fix_tokenizer, templates -from .utils import Role, split_dataset - - -__all__ = ["get_dataset", "Template", "get_template_and_fix_tokenizer", "templates", "Role", "split_dataset"] diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/aligner.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/aligner.py deleted file mode 100644 index 4de37e6..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/aligner.py +++ /dev/null @@ -1,133 +0,0 @@ -from functools import partial -from typing import TYPE_CHECKING, Any, Dict, List, Union - -from datasets import Features - -from .utils import Role - - -if TYPE_CHECKING: - from datasets import Dataset, IterableDataset - - from ..hparams import DataArguments - from .parser import DatasetAttr - - -def convert_alpaca(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]: - outputs = {"prompt": [], "response": [], "system": [], "tools": []} - for i in range(len(examples[dataset_attr.prompt])): - prompt = [] - if dataset_attr.history and isinstance(examples[dataset_attr.history][i], list): - for old_prompt, old_response in examples[dataset_attr.history][i]: - prompt.append({"role": Role.USER.value, "content": old_prompt}) - prompt.append({"role": Role.ASSISTANT.value, "content": old_response}) - - content = [] - if dataset_attr.prompt and examples[dataset_attr.prompt][i]: - content.append(examples[dataset_attr.prompt][i]) - - if dataset_attr.query and examples[dataset_attr.query][i]: - content.append(examples[dataset_attr.query][i]) - - prompt.append({"role": Role.USER.value, "content": "\n".join(content)}) - - if dataset_attr.response and isinstance(examples[dataset_attr.response][i], list): - response = [ - {"role": 
Role.ASSISTANT.value, "content": content} for content in examples[dataset_attr.response][i] - ] - elif dataset_attr.response and isinstance(examples[dataset_attr.response][i], str): - response = [{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.response][i]}] - else: - response = [] - - outputs["prompt"].append(prompt) - outputs["response"].append(response) - outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "") - outputs["tools"].append("") - - return outputs - - -def convert_sharegpt(examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr") -> Dict[str, List[Any]]: - outputs = {"prompt": [], "response": [], "system": [], "tools": []} - tag_mapping = { - dataset_attr.user_tag: Role.USER.value, - dataset_attr.assistant_tag: Role.ASSISTANT.value, - dataset_attr.observation_tag: Role.OBSERVATION.value, - dataset_attr.function_tag: Role.FUNCTION.value, - dataset_attr.system_tag: Role.SYSTEM.value, - } - odd_tags = (dataset_attr.user_tag, dataset_attr.observation_tag) - even_tags = (dataset_attr.assistant_tag, dataset_attr.function_tag) - accept_tags = (odd_tags, even_tags) - for i, messages in enumerate(examples[dataset_attr.messages]): - if dataset_attr.system_tag and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag: - system = messages[0][dataset_attr.content_tag] - messages = messages[1:] - else: - system = examples[dataset_attr.system][i] if dataset_attr.system else "" - - messages = messages[: len(messages) // 2 * 2] # should be multiples of 2 - if len(messages) == 0: - continue - - aligned_messages = [] - for turn_idx, message in enumerate(messages): - if message[dataset_attr.role_tag] not in accept_tags[turn_idx % 2]: - raise ValueError("Invalid role tag in {}.".format(messages)) - - aligned_messages.append( - {"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]} - ) - - outputs["prompt"].append(aligned_messages[:-1]) - outputs["response"].append(aligned_messages[-1:]) - outputs["system"].append(system) - outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "") - - return outputs - - -def align_dataset( - dataset: Union["Dataset", "IterableDataset"], dataset_attr: "DatasetAttr", data_args: "DataArguments" -) -> Union["Dataset", "IterableDataset"]: - r""" - Aligned dataset: - prompt: [{"role": "user", "content": "..."}] * (2T - 1) - response: [{"role": "assistant", "content": "..."}] * N (N > 1 for ranking dataset) - system: "..." - tools: "..." 
- """ - if dataset_attr.formatting == "alpaca": - convert_func = partial(convert_alpaca, dataset_attr=dataset_attr) - else: - convert_func = partial(convert_sharegpt, dataset_attr=dataset_attr) - - column_names = list(next(iter(dataset)).keys()) - features = Features.from_dict( - { - "prompt": [ - {"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}} - ], - "response": [ - {"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}} - ], - "system": {"dtype": "string", "_type": "Value"}, - "tools": {"dtype": "string", "_type": "Value"}, - } - ) - kwargs = {} - if not data_args.streaming: - kwargs = dict( - num_proc=data_args.preprocessing_num_workers, - load_from_cache_file=(not data_args.overwrite_cache), - desc="Converting format of dataset", - ) - - return dataset.map( - convert_func, - batched=True, - remove_columns=column_names, - features=features, - **kwargs, - ) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/preprocess.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/preprocess.py deleted file mode 100644 index 7fb0a9b..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/data/preprocess.py +++ /dev/null @@ -1,276 +0,0 @@ -from functools import partial -from itertools import chain -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Tuple - -from ..extras.constants import IGNORE_INDEX -from ..extras.logging import get_logger -from .utils import Role - - -if TYPE_CHECKING: - from transformers import Seq2SeqTrainingArguments - from transformers.tokenization_utils import PreTrainedTokenizer - - from ..hparams import DataArguments - from .template import Template - - -logger = get_logger(__name__) - - -def preprocess_pretrain_dataset( - examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments" -) -> Dict[str, List[List[int]]]: - # build grouped texts with format `X1 X2 X3 ...` if packing is enabled - text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]] - if not data_args.packing: - return tokenizer(text_examples, add_special_tokens=False, max_length=data_args.cutoff_len) - - tokenized_examples = tokenizer(text_examples, add_special_tokens=False) - concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()} - total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]]) - block_size = data_args.cutoff_len - # we drop the small remainder, and if the total_length < block_size, we exclude this batch - total_length = (total_length // block_size) * block_size - # split by chunks of cutoff_len - result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - if data_args.template == "gemma": - for i in range(len(result["input_ids"])): - result["input_ids"][i][0] = tokenizer.bos_token_id - - return result - - -def preprocess_supervised_dataset( - examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", -) -> Dict[str, List[List[int]]]: - # build inputs with format ` X Y ` and labels with format ` ... Y ` - # for multiturn examples, we only mask the prompt part in each prompt-response pair. 
- model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} - - for i in range(len(examples["prompt"])): - if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1: - continue - - messages = examples["prompt"][i] + examples["response"][i] - input_ids, labels = [], [] - for turn_idx, (source_ids, target_ids) in enumerate( - template.encode_multiturn( - tokenizer, - messages, - examples["system"][i], - examples["tools"][i], - data_args.cutoff_len, - data_args.reserved_label_len, - ) - ): - if data_args.train_on_prompt: - source_mask = source_ids - elif turn_idx != 0 and template.efficient_eos: - source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1) - else: - source_mask = [IGNORE_INDEX] * len(source_ids) - - input_ids += source_ids + target_ids - labels += source_mask + target_ids - - if template.efficient_eos: - input_ids += [tokenizer.eos_token_id] - labels += [tokenizer.eos_token_id] - - model_inputs["input_ids"].append(input_ids) - model_inputs["attention_mask"].append([1] * len(input_ids)) - model_inputs["labels"].append(labels) - - return model_inputs - - -def preprocess_packed_supervised_dataset( - examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", -) -> Dict[str, List[List[int]]]: - # build inputs with format ` X1 Y1 X2 Y2 ` - # and labels with format ` ... Y1 ... Y2 ` - model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} - input_ids, labels = [], [] - for i in range(len(examples["prompt"])): - if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1: - continue - - messages = examples["prompt"][i] + examples["response"][i] - for source_ids, target_ids in template.encode_multiturn( - tokenizer, messages, examples["system"][i], examples["tools"][i] - ): - if data_args.train_on_prompt: - source_mask = source_ids - elif len(input_ids) != 0 and template.efficient_eos: - source_mask = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (len(source_ids) - 1) - else: - source_mask = [IGNORE_INDEX] * len(source_ids) - - input_ids += source_ids + target_ids - labels += source_mask + target_ids - - if template.efficient_eos: - input_ids += [tokenizer.eos_token_id] - labels += [tokenizer.eos_token_id] - - total_length = len(input_ids) - block_size = data_args.cutoff_len - # we drop the small remainder, and if the total_length < block_size, we exclude this batch - total_length = (total_length // block_size) * block_size - # split by chunks of cutoff_len - for i in range(0, total_length, block_size): - if not all(label == IGNORE_INDEX for label in labels[i : i + block_size]): - model_inputs["input_ids"].append(input_ids[i : i + block_size]) - model_inputs["attention_mask"].append([1] * block_size) - model_inputs["labels"].append(labels[i : i + block_size]) - - return model_inputs - - -def preprocess_unsupervised_dataset( - examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", -) -> Dict[str, List[List[int]]]: - # build inputs with format ` X` and labels with format `Y ` - model_inputs = {"input_ids": [], "attention_mask": [], "labels": []} - - for i in range(len(examples["prompt"])): - if len(examples["prompt"][i]) % 2 != 1: - continue - - if len(examples["response"][i]) == 1: - messages = examples["prompt"][i] + examples["response"][i] - else: - messages = examples["prompt"][i] + [{"role": Role.ASSISTANT.value, "content": ""}] - - input_ids, labels = template.encode_oneturn( - 
tokenizer, - messages, - examples["system"][i], - examples["tools"][i], - data_args.cutoff_len, - data_args.reserved_label_len, - ) - - if template.efficient_eos: - labels += [tokenizer.eos_token_id] - - model_inputs["input_ids"].append(input_ids) - model_inputs["attention_mask"].append([1] * len(input_ids)) - model_inputs["labels"].append(labels) - - return model_inputs - - -def preprocess_pairwise_dataset( - examples: Dict[str, List[Any]], - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", -) -> Dict[str, List[List[int]]]: - # build input pairs with format ` X`, `Y1 ` and `Y2 ` - model_inputs = {"prompt_ids": [], "chosen_ids": [], "rejected_ids": []} - for i in range(len(examples["prompt"])): - if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) < 2: - continue - - chosen_messages = examples["prompt"][i] + [examples["response"][i][0]] - rejected_messages = examples["prompt"][i] + [examples["response"][i][1]] - prompt_ids, chosen_ids = template.encode_oneturn( - tokenizer, - chosen_messages, - examples["system"][i], - examples["tools"][i], - data_args.cutoff_len, - data_args.reserved_label_len, - ) - _, rejected_ids = template.encode_oneturn( - tokenizer, - rejected_messages, - examples["system"][i], - examples["tools"][i], - data_args.cutoff_len, - data_args.reserved_label_len, - ) - - if template.efficient_eos: - chosen_ids += [tokenizer.eos_token_id] - rejected_ids += [tokenizer.eos_token_id] - - model_inputs["prompt_ids"].append(prompt_ids) - model_inputs["chosen_ids"].append(chosen_ids) - model_inputs["rejected_ids"].append(rejected_ids) - - return model_inputs - - -def print_supervised_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None: - print("input_ids:\n{}".format(example["input_ids"])) - print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False))) - print("label_ids:\n{}".format(example["labels"])) - print( - "labels:\n{}".format( - tokenizer.decode(list(filter(lambda x: x != IGNORE_INDEX, example["labels"])), skip_special_tokens=False) - ) - ) - - -def print_pairwise_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None: - print("prompt_ids:\n{}".format(example["prompt_ids"])) - print("prompt:\n{}".format(tokenizer.decode(example["prompt_ids"], skip_special_tokens=False))) - print("chosen_ids:\n{}".format(example["chosen_ids"])) - print("chosen:\n{}".format(tokenizer.decode(example["chosen_ids"], skip_special_tokens=False))) - print("rejected_ids:\n{}".format(example["rejected_ids"])) - print("rejected:\n{}".format(tokenizer.decode(example["rejected_ids"], skip_special_tokens=False))) - - -def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None: - print("input_ids:\n{}".format(example["input_ids"])) - print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False))) - - -def get_preprocess_and_print_func( - tokenizer: "PreTrainedTokenizer", - template: "Template", - data_args: "DataArguments", - training_args: "Seq2SeqTrainingArguments", - stage: Literal["pt", "sft", "rm", "ppo"], -) -> Tuple[Callable, Callable]: - if stage == "pt": - preprocess_func = partial(preprocess_pretrain_dataset, tokenizer=tokenizer, data_args=data_args) - print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer) - elif stage == "sft" and not training_args.predict_with_generate: - if data_args.packing: - preprocess_func = partial( - 
preprocess_packed_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args - ) - else: - preprocess_func = partial( - preprocess_supervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args - ) - - print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer) - elif stage == "rm": - preprocess_func = partial( - preprocess_pairwise_dataset, tokenizer=tokenizer, template=template, data_args=data_args - ) - print_function = partial(print_pairwise_dataset_example, tokenizer=tokenizer) - else: - preprocess_func = partial( - preprocess_unsupervised_dataset, tokenizer=tokenizer, template=template, data_args=data_args - ) - print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer) - - return preprocess_func, print_function diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/eval/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/eval/__init__.py deleted file mode 100644 index 95ce037..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/eval/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .evaluator import Evaluator - - -__all__ = ["Evaluator"] diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/callbacks.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/callbacks.py deleted file mode 100644 index 086dea6..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/callbacks.py +++ /dev/null @@ -1,153 +0,0 @@ -import json -import os -import time -from datetime import timedelta -from typing import TYPE_CHECKING - -from transformers import TrainerCallback -from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length - -from .constants import LOG_FILE_NAME -from .logging import get_logger -from .misc import fix_valuehead_checkpoint - - -if TYPE_CHECKING: - from transformers import TrainerControl, TrainerState, TrainingArguments - - -logger = get_logger(__name__) - - -class FixValueHeadModelCallback(TrainerCallback): - def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): - r""" - Event called after a checkpoint save. - """ - if args.should_save: - fix_valuehead_checkpoint( - model=kwargs.pop("model"), - output_dir=os.path.join(args.output_dir, "{}-{}".format(PREFIX_CHECKPOINT_DIR, state.global_step)), - safe_serialization=args.save_safetensors, - ) - - -class LogCallback(TrainerCallback): - def __init__(self, runner=None): - self.runner = runner - self.in_training = False - self.start_time = time.time() - self.cur_steps = 0 - self.max_steps = 0 - self.elapsed_time = "" - self.remaining_time = "" - - def timing(self): - cur_time = time.time() - elapsed_time = cur_time - self.start_time - avg_time_per_step = elapsed_time / self.cur_steps if self.cur_steps != 0 else 0 - remaining_time = (self.max_steps - self.cur_steps) * avg_time_per_step - self.elapsed_time = str(timedelta(seconds=int(elapsed_time))) - self.remaining_time = str(timedelta(seconds=int(remaining_time))) - - def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): - r""" - Event called at the beginning of training. 
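# Illustrative sketch, not part of the patch: the elapsed/remaining-time estimate kept
# by LogCallback.timing() above - average seconds per finished step multiplied by the
# steps still to run. The numbers are made up.
import time
from datetime import timedelta

start_time = time.time() - 30          # pretend training started 30 seconds ago
cur_steps, max_steps = 10, 100
elapsed = time.time() - start_time
avg_per_step = elapsed / cur_steps if cur_steps else 0
remaining = (max_steps - cur_steps) * avg_per_step
print(str(timedelta(seconds=int(elapsed))), str(timedelta(seconds=int(remaining))))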
- """ - if state.is_local_process_zero: - self.in_training = True - self.start_time = time.time() - self.max_steps = state.max_steps - if os.path.exists(os.path.join(args.output_dir, LOG_FILE_NAME)) and args.overwrite_output_dir: - logger.warning("Previous log file in this folder will be deleted.") - os.remove(os.path.join(args.output_dir, LOG_FILE_NAME)) - - def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): - r""" - Event called at the end of training. - """ - if state.is_local_process_zero: - self.in_training = False - self.cur_steps = 0 - self.max_steps = 0 - - def on_substep_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): - r""" - Event called at the end of an substep during gradient accumulation. - """ - if state.is_local_process_zero and self.runner is not None and self.runner.aborted: - control.should_epoch_stop = True - control.should_training_stop = True - - def on_step_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): - r""" - Event called at the end of a training step. - """ - if state.is_local_process_zero: - self.cur_steps = state.global_step - self.timing() - if self.runner is not None and self.runner.aborted: - control.should_epoch_stop = True - control.should_training_stop = True - - def on_evaluate(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): - r""" - Event called after an evaluation phase. - """ - if state.is_local_process_zero and not self.in_training: - self.cur_steps = 0 - self.max_steps = 0 - - def on_predict( - self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", *other, **kwargs - ): - r""" - Event called after a successful prediction. - """ - if state.is_local_process_zero and not self.in_training: - self.cur_steps = 0 - self.max_steps = 0 - - def on_log(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs) -> None: - r""" - Event called after logging the last logs. - """ - if not state.is_local_process_zero: - return - - logs = dict( - current_steps=self.cur_steps, - total_steps=self.max_steps, - loss=state.log_history[-1].get("loss", None), - eval_loss=state.log_history[-1].get("eval_loss", None), - predict_loss=state.log_history[-1].get("predict_loss", None), - reward=state.log_history[-1].get("reward", None), - learning_rate=state.log_history[-1].get("learning_rate", None), - epoch=state.log_history[-1].get("epoch", None), - percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100, - elapsed_time=self.elapsed_time, - remaining_time=self.remaining_time, - ) - if self.runner is not None: - logger.info( - "{{'loss': {:.4f}, 'learning_rate': {:2.4e}, 'epoch': {:.2f}}}".format( - logs["loss"] or 0, logs["learning_rate"] or 0, logs["epoch"] or 0 - ) - ) - - os.makedirs(args.output_dir, exist_ok=True) - with open(os.path.join(args.output_dir, "trainer_log.jsonl"), "a", encoding="utf-8") as f: - f.write(json.dumps(logs) + "\n") - - def on_prediction_step( - self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs - ): - r""" - Event called after a prediction step. 
- """ - eval_dataloader = kwargs.pop("eval_dataloader", None) - if state.is_local_process_zero and has_length(eval_dataloader) and not self.in_training: - if self.max_steps == 0: - self.max_steps = len(eval_dataloader) - self.cur_steps += 1 - self.timing() diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/logging.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/logging.py deleted file mode 100644 index bb27077..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/logging.py +++ /dev/null @@ -1,48 +0,0 @@ -import logging -import sys - - -class LoggerHandler(logging.Handler): - r""" - Logger handler used in Web UI. - """ - - def __init__(self): - super().__init__() - self.log = "" - - def reset(self): - self.log = "" - - def emit(self, record): - if record.name == "httpx": - return - log_entry = self.format(record) - self.log += log_entry - self.log += "\n\n" - - -def get_logger(name: str) -> logging.Logger: - r""" - Gets a standard logger with a stream hander to stdout. - """ - formatter = logging.Formatter( - fmt="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S" - ) - handler = logging.StreamHandler(sys.stdout) - handler.setFormatter(formatter) - - logger = logging.getLogger(name) - logger.setLevel(logging.INFO) - logger.addHandler(handler) - - return logger - - -def reset_logging() -> None: - r""" - Removes basic config of root logger. (unused in script) - """ - root = logging.getLogger() - list(map(root.removeHandler, root.handlers)) - list(map(root.removeFilter, root.filters)) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/patches/llama_patch.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/patches/llama_patch.py deleted file mode 100644 index fa43f76..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/patches/llama_patch.py +++ /dev/null @@ -1,197 +0,0 @@ -import math -from typing import Optional, Tuple - -import torch -import torch.nn as nn -from transformers.models.llama.modeling_llama import ( - Cache, - LlamaAttention, - LlamaFlashAttention2, - apply_rotary_pos_emb, - repeat_kv, -) -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - - -# Modified from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py -def llama_torch_attn_forward( - self: "LlamaAttention", - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional["Cache"] = None, - output_attentions: bool = False, - **kwargs, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, 
"cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - if getattr(self.config, "group_size_ratio", None) and self.training: # shift - groupsz = int(q_len * getattr(self.config, "group_size_ratio")) - assert q_len % groupsz == 0, "q_len {} should be divisible by group size {}.".format(q_len, groupsz) - num_groups = q_len // groupsz - - def shift(state: torch.Tensor) -> torch.Tensor: - state = state.transpose(1, 2) # output: (bsz, seq_len, n_heads, head_dim) - state = torch.cat( - (state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)), - dim=2, - ) - return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim).transpose(1, 2) - - query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states) - if attention_mask is not None: - attention_mask = attention_mask[:, :, :groupsz, :groupsz].repeat(num_groups, 1, 1, 1) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attention_mask is not None: - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) # (bsz, :, seq_len, :) or (bsz*n_group, :, groupsz, :) - attn_output = attn_output.transpose(1, 2).contiguous() - - if getattr(self.config, "group_size_ratio", None) and self.training: # shift back - attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim) - attn_output = torch.cat( - ( - attn_output[:, :, : self.num_heads // 2], - attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1), - ) - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -# Modified from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py -def llama_flash_attn_forward( - self: "LlamaFlashAttention2", - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - **kwargs, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # LlamaFlashAttention2 attention does not support output_attentions - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # FlashAttention requires the input to have the shape (bsz, seq_len, n_heads, head_dim) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += 
past_key_value.get_usable_length(kv_seq_len, self.layer_idx) - - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - query_states = query_states.transpose(1, 2) # (bsz, seq_len, n_heads, head_dim) - key_states = key_states.transpose(1, 2) # (bsz, seq_len, n_heads, head_dim) - value_states = value_states.transpose(1, 2) # (bsz, seq_len, n_heads, head_dim) - - dropout_rate = self.attention_dropout if self.training else 0.0 - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once("The input hidden states seems to be silently casted in float32.") - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - if getattr(self.config, "group_size_ratio", None) and self.training: # shift - groupsz = int(q_len * getattr(self.config, "group_size_ratio")) - assert q_len % groupsz == 0, "q_len {} should be divisible by group size {}.".format(q_len, groupsz) - num_groups = q_len // groupsz - - def shift(state: torch.Tensor) -> torch.Tensor: - state = torch.cat( - (state[:, :, : self.num_heads // 2], state[:, :, self.num_heads // 2 :].roll(-groupsz // 2, dims=1)), - dim=2, - ) - return state.reshape(bsz * num_groups, groupsz, self.num_heads, self.head_dim) - - query_states, key_states, value_states = shift(query_states), shift(key_states), shift(value_states) - if attention_mask is not None: - attention_mask = attention_mask[:, :, :groupsz, :groupsz].repeat(num_groups, 1, 1, 1) - - attn_output: torch.Tensor = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate - ) - - if getattr(self.config, "group_size_ratio", None) and self.training: # shift back - attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim) - attn_output = torch.cat( - ( - attn_output[:, :, : self.num_heads // 2], - attn_output[:, :, self.num_heads // 2 :].roll(groupsz // 2, dims=1), - ) - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -def apply_llama_patch() -> None: - LlamaAttention.forward = llama_torch_attn_forward - LlamaFlashAttention2.forward = llama_flash_attn_forward diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/patches/mixtral_patch.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/patches/mixtral_patch.py deleted file mode 100644 index 382492e..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/extras/patches/mixtral_patch.py +++ /dev/null @@ -1,38 +0,0 @@ -import torch -import torch.nn.functional as F -from transformers.models.mixtral.modeling_mixtral import MixtralBLockSparseTop2MLP, MixtralSparseMoeBlock - - -def mlp_forward(self: 
"MixtralBLockSparseTop2MLP", hidden_states: torch.Tensor) -> torch.Tensor: - current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states) - current_hidden_states = self.w2(current_hidden_states) - return current_hidden_states - - -# Modified from: https://huggingface.co/deepseek-ai/deepseek-moe-16b-base/blob/main/modeling_deepseek.py -def moe_forward(self: "MixtralSparseMoeBlock", hidden_states: torch.Tensor) -> torch.Tensor: - batch_size, sequence_length, hidden_dim = hidden_states.shape - hidden_states = hidden_states.view(-1, hidden_dim) - # router_logits: (batch * sequence_length, n_experts) - router_logits = self.gate(hidden_states) - - routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) - topk_weight, topk_idx = torch.topk(routing_weights, self.top_k, dim=-1, sorted=False) - topk_weight /= topk_weight.sum(dim=-1, keepdim=True) - # we cast back to the input dtype - topk_weight = topk_weight.to(hidden_states.dtype) - - hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0) - y = torch.empty_like(hidden_states) - flat_topk_idx = topk_idx.view(-1) - for i in range(self.num_experts): - expert = self.experts[i] - y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i]) - y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1) - final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim) - return final_hidden_states, router_logits - - -def patch_mixtral_replace_moe_impl() -> None: - MixtralBLockSparseTop2MLP.forward = mlp_forward - MixtralSparseMoeBlock.forward = moe_forward diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/__init__.py deleted file mode 100644 index bb7c4db..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -from .loader import load_model, load_model_and_tokenizer, load_tokenizer -from .utils import load_valuehead_params - - -__all__ = [ - "load_model", - "load_model_and_tokenizer", - "load_tokenizer", - "load_valuehead_params", -] diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/adapter.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/adapter.py deleted file mode 100644 index 2f203b1..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/adapter.py +++ /dev/null @@ -1,162 +0,0 @@ -from typing import TYPE_CHECKING - -import torch -from peft import LoraConfig, LoraModel, PeftModel, TaskType, get_peft_model -from transformers.integrations import is_deepspeed_zero3_enabled - -from ..extras.logging import get_logger -from .utils import find_all_linear_modules, find_expanded_modules - - -if TYPE_CHECKING: - from transformers.modeling_utils import PreTrainedModel - - from ..hparams import FinetuningArguments, ModelArguments - - -logger = get_logger(__name__) - - -def init_adapter( - model: "PreTrainedModel", model_args: "ModelArguments", finetuning_args: "FinetuningArguments", is_trainable: bool -) -> "PreTrainedModel": - r""" - Initializes the adapters. - - Support full-parameter, freeze and LoRA training. - - Note that the trainable parameters must be cast to float32. 
- """ - - if (not is_trainable) and model_args.adapter_name_or_path is None: - logger.info("Adapter is not found at evaluation, load the base model.") - return model - - if finetuning_args.finetuning_type == "full" and is_trainable: - logger.info("Fine-tuning method: Full") - if not finetuning_args.pure_bf16: - model = model.float() - - if finetuning_args.finetuning_type == "freeze" and is_trainable: - logger.info("Fine-tuning method: Freeze") - num_layers = ( - getattr(model.config, "num_hidden_layers", None) - or getattr(model.config, "num_layers", None) - or getattr(model.config, "n_layer", None) - ) - if not num_layers: - raise ValueError("Current model does not support freeze tuning.") - - if finetuning_args.use_llama_pro: - if num_layers % finetuning_args.num_layer_trainable != 0: - raise ValueError( - "`num_layers` {} should be divisible by `num_layer_trainable` {}.".format( - num_layers, finetuning_args.num_layer_trainable - ) - ) - - stride = num_layers // finetuning_args.num_layer_trainable - trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride) - elif finetuning_args.num_layer_trainable > 0: # fine-tuning the last n layers if num_layer_trainable > 0 - trainable_layer_ids = range(num_layers - finetuning_args.num_layer_trainable, num_layers) - else: # fine-tuning the first n layers if num_layer_trainable < 0 - trainable_layer_ids = range(-finetuning_args.num_layer_trainable) - - freeze_modules = {"all"} - for name, _ in model.named_modules(): - if ".0." in name: - freeze_modules.add(name.split(".0.")[-1].split(".")[0]) - - trainable_layers = [] - for module_name in finetuning_args.name_module_trainable: - if module_name not in freeze_modules: - raise ValueError( - "Module {} is not found, please choose from {}".format(module_name, ", ".join(freeze_modules)) - ) - - for idx in trainable_layer_ids: - trainable_layers.append(".{:d}.{}".format(idx, module_name if module_name != "all" else "")) - - for name, param in model.named_parameters(): - if any(trainable_layer in name for trainable_layer in trainable_layers): - if not finetuning_args.pure_bf16: - param.data = param.data.to(torch.float32) - else: - param.requires_grad_(False) - - logger.info("Set trainable layers: {}".format(",".join(map(str, trainable_layer_ids)))) - - if finetuning_args.finetuning_type == "lora": - logger.info("Fine-tuning method: {}".format("DoRA" if finetuning_args.use_dora else "LoRA")) - adapter_to_resume = None - - if model_args.adapter_name_or_path is not None: - is_mergeable = True - if getattr(model, "quantization_method", None): # merge lora in quantized model is unstable - assert len(model_args.adapter_name_or_path) == 1, "Quantized model only accepts a single adapter." - is_mergeable = False - - if is_deepspeed_zero3_enabled(): - assert len(model_args.adapter_name_or_path) == 1, "Cannot use multiple adapters in DeepSpeed ZeRO-3." 
- is_mergeable = False - - if (is_trainable and not finetuning_args.create_new_adapter) or (not is_mergeable): - adapter_to_merge = model_args.adapter_name_or_path[:-1] - adapter_to_resume = model_args.adapter_name_or_path[-1] - else: - adapter_to_merge = model_args.adapter_name_or_path - - for adapter in adapter_to_merge: - model: "LoraModel" = PeftModel.from_pretrained(model, adapter) - model = model.merge_and_unload() - - if len(adapter_to_merge) > 0: - logger.info("Merged {} adapter(s).".format(len(adapter_to_merge))) - - if adapter_to_resume is not None: # resume lora training - model = PeftModel.from_pretrained(model, adapter_to_resume, is_trainable=is_trainable) - - if is_trainable and adapter_to_resume is None: # create new lora weights while training - if len(finetuning_args.lora_target) == 1 and finetuning_args.lora_target[0] == "all": - target_modules = find_all_linear_modules(model) - else: - target_modules = finetuning_args.lora_target - - if finetuning_args.use_llama_pro: - target_modules = find_expanded_modules(model, target_modules, finetuning_args.num_layer_trainable) - - if finetuning_args.use_dora: - if getattr(model, "quantization_method", None): - raise ValueError("DoRA is currently not compatible with quantized models.") - - peft_kwargs = { - "r": finetuning_args.lora_rank, - "target_modules": target_modules, - "lora_alpha": finetuning_args.lora_alpha, - "lora_dropout": finetuning_args.lora_dropout, - "use_rslora": finetuning_args.use_rslora, - } - - if model_args.use_unsloth: - from unsloth import FastLanguageModel # type: ignore - - unsloth_peft_kwargs = {"model": model, "max_seq_length": model_args.model_max_length} - model = FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs) - else: - lora_config = LoraConfig( - task_type=TaskType.CAUSAL_LM, - inference_mode=False, - modules_to_save=finetuning_args.additional_target, - use_dora=finetuning_args.use_dora, - **peft_kwargs, - ) - model = get_peft_model(model, lora_config) - - if not finetuning_args.pure_bf16: - for param in filter(lambda p: p.requires_grad, model.parameters()): - param.data = param.data.to(torch.float32) - - if model_args.adapter_name_or_path is not None: - logger.info("Loaded adapter(s): {}".format(",".join(model_args.adapter_name_or_path))) - - return model diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/loader.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/loader.py deleted file mode 100644 index 0f886c3..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/loader.py +++ /dev/null @@ -1,151 +0,0 @@ -from typing import TYPE_CHECKING, Any, Dict, Tuple - -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer -from trl import AutoModelForCausalLMWithValueHead - -from ..extras.logging import get_logger -from ..extras.misc import count_parameters, get_current_device, try_download_model_from_ms -from .adapter import init_adapter -from .patcher import patch_config, patch_model, patch_tokenizer, patch_valuehead_model -from .utils import load_valuehead_params, register_autoclass - - -if TYPE_CHECKING: - from transformers import PreTrainedModel, PreTrainedTokenizer - - from ..hparams import FinetuningArguments, ModelArguments - - -logger = get_logger(__name__) - - -def _get_init_kwargs(model_args: "ModelArguments") -> Dict[str, Any]: - return { - "trust_remote_code": True, - "cache_dir": model_args.cache_dir, - "revision": model_args.model_revision, - "token": model_args.hf_hub_token, - } - - -def load_tokenizer(model_args: 
"ModelArguments") -> "PreTrainedTokenizer": - r""" - Loads pretrained tokenizer. Must before load_model. - - Note: including inplace operation of model_args. - """ - try_download_model_from_ms(model_args) - init_kwargs = _get_init_kwargs(model_args) - - tokenizer = AutoTokenizer.from_pretrained( - model_args.model_name_or_path, - use_fast=model_args.use_fast_tokenizer, - split_special_tokens=model_args.split_special_tokens, - padding_side="right", - **init_kwargs, - ) - patch_tokenizer(tokenizer) - return tokenizer - - -def load_model( - tokenizer: "PreTrainedTokenizer", - model_args: "ModelArguments", - finetuning_args: "FinetuningArguments", - is_trainable: bool = False, - add_valuehead: bool = False, -) -> "PreTrainedModel": - r""" - Loads pretrained model. Must after load_tokenizer. - """ - init_kwargs = _get_init_kwargs(model_args) - config = AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs) - patch_config(config, tokenizer, model_args, init_kwargs, is_trainable) - - model = None - if is_trainable and model_args.use_unsloth: - from unsloth import FastLanguageModel # type: ignore - - unsloth_kwargs = { - "model_name": model_args.model_name_or_path, - "max_seq_length": model_args.model_max_length, - "dtype": model_args.compute_dtype, - "load_in_4bit": model_args.quantization_bit == 4, - "token": model_args.hf_hub_token, - "device_map": {"": get_current_device()}, - "rope_scaling": getattr(config, "rope_scaling", None), - } - try: - model, _ = FastLanguageModel.from_pretrained(**unsloth_kwargs) - except NotImplementedError: - logger.warning("Unsloth does not support model type {}.".format(getattr(config, "model_type", None))) - model_args.use_unsloth = False - - if model_args.adapter_name_or_path: - model_args.adapter_name_or_path = None - logger.warning("Unsloth does not support loading adapters.") - - if model is None: - model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, config=config, **init_kwargs) - - patch_model(model, tokenizer, model_args, is_trainable) - register_autoclass(config, model, tokenizer) - - model = init_adapter(model, model_args, finetuning_args, is_trainable) - - if add_valuehead: - model: "AutoModelForCausalLMWithValueHead" = AutoModelForCausalLMWithValueHead.from_pretrained(model) - patch_valuehead_model(model) - - if model_args.adapter_name_or_path is not None: - vhead_path = model_args.adapter_name_or_path[-1] - else: - vhead_path = model_args.model_name_or_path - - vhead_params = load_valuehead_params(vhead_path, model_args) - if vhead_params is not None: - model.load_state_dict(vhead_params, strict=False) - logger.info("Loaded valuehead from checkpoint: {}".format(vhead_path)) - - if not is_trainable: - model.requires_grad_(False) - if not getattr(model, "quantization_method", None): - for param in filter(lambda p: p.device.type == "cuda", model.parameters()): - param.data = param.data.to(model_args.compute_dtype) - - model.eval() - else: - model.train() - - trainable_params, all_param = count_parameters(model) - if is_trainable: - param_stats = "trainable params: {:d} || all params: {:d} || trainable%: {:.4f}".format( - trainable_params, all_param, 100 * trainable_params / all_param - ) - else: - param_stats = "all params: {:d}".format(all_param) - logger.info(param_stats) - - if model_args.print_param_status: - for name, param in model.named_parameters(): - print( - "name: {}, dtype: {}, device: {}, trainable: {}".format( - name, param.dtype, param.device, param.requires_grad - ) - ) - - return model - - -def 
load_model_and_tokenizer( - model_args: "ModelArguments", - finetuning_args: "FinetuningArguments", - is_trainable: bool = False, - add_valuehead: bool = False, -) -> Tuple["PreTrainedModel", "PreTrainedTokenizer"]: - r""" - Loads pretrained model and tokenizer. - """ - tokenizer = load_tokenizer(model_args) - model = load_model(tokenizer, model_args, finetuning_args, is_trainable, add_valuehead) - return model, tokenizer diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/patcher.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/patcher.py deleted file mode 100644 index 0d8b9d7..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/patcher.py +++ /dev/null @@ -1,352 +0,0 @@ -import math -import os -import random -from contextlib import nullcontext -from types import MethodType -from typing import TYPE_CHECKING, Any, Dict, List, Tuple - -import torch -from datasets import load_dataset -from peft import PeftModel -from transformers import BitsAndBytesConfig, GPTQConfig, PreTrainedModel, PreTrainedTokenizerBase -from transformers.integrations import is_deepspeed_zero3_enabled -from transformers.utils.versions import require_version - -from ..extras.constants import FILEEXT2TYPE, LAYERNORM_NAMES -from ..extras.logging import get_logger -from ..extras.misc import get_current_device, infer_optim_dtype -from ..extras.packages import is_flash_attn2_available -from ..extras.patches.llama_patch import apply_llama_patch -from ..extras.patches.mixtral_patch import patch_mixtral_replace_moe_impl - - -if TYPE_CHECKING: - from transformers import PretrainedConfig, PreTrainedTokenizer - from trl import AutoModelForCausalLMWithValueHead - - from ..hparams import ModelArguments - - -logger = get_logger(__name__) -SUPPORTED_CLASS_FOR_S2ATTN = ["llama"] - - -def _noisy_mean_initialization(embed_weight: torch.Tensor, num_new_tokens: int): - embedding_dim = embed_weight.size(1) - avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True) - noise_weight = torch.empty_like(embed_weight[-num_new_tokens:]) - noise_weight.normal_(mean=0, std=(1.0 / math.sqrt(embedding_dim))) - embed_weight[-num_new_tokens:] = avg_weight + noise_weight - - -def _resize_embedding_layer(model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer") -> None: - r""" - Resize token embeddings. 
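# Illustrative sketch, not part of the patch: the noisy-mean initialisation that
# _noisy_mean_initialization applies to newly added embedding rows - mean of the existing
# rows plus Gaussian noise scaled by 1/sqrt(dim). Sizes are made up.
import math
import torch

embed_weight = torch.randn(10, 8)     # 10 existing tokens, embedding dim 8
num_new_tokens = 2
embed_weight = torch.cat([embed_weight, torch.zeros(num_new_tokens, 8)], dim=0)
avg = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True)
noise = torch.empty(num_new_tokens, 8).normal_(mean=0, std=1.0 / math.sqrt(8))
embed_weight[-num_new_tokens:] = avg + noise
print(embed_weight.shape)             # torch.Size([12, 8])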
- """ - if is_deepspeed_zero3_enabled(): - import deepspeed # type: ignore - - params = [model.get_input_embeddings().weight] - if model.get_output_embeddings() is not None and not model.config.tie_word_embeddings: - params.append(model.get_output_embeddings().weight) - - context_maybe_zero3 = deepspeed.zero.GatheredParameters(params, modifier_rank=0) - else: - context_maybe_zero3 = nullcontext() - - with context_maybe_zero3: - current_embedding_size = model.get_input_embeddings().weight.size(0) - - if len(tokenizer) > current_embedding_size: - if not isinstance(model.get_output_embeddings(), torch.nn.Linear): - logger.warning("Current model does not support resizing token embeddings.") - return - - model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64) - with context_maybe_zero3: - new_embedding_size = model.get_input_embeddings().weight.size(0) - num_new_tokens = new_embedding_size - current_embedding_size - _noisy_mean_initialization(model.get_input_embeddings().weight.data, num_new_tokens) - _noisy_mean_initialization(model.get_output_embeddings().weight.data, num_new_tokens) - - logger.info("Resized token embeddings from {} to {}.".format(current_embedding_size, new_embedding_size)) - - -def _get_quantization_dataset(tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments") -> List[str]: - r""" - Inspired by: https://github.com/huggingface/optimum/blob/v1.16.0/optimum/gptq/data.py#L133 - TODO: remove tokenizer.decode() https://github.com/huggingface/optimum/pull/1600 - """ - if os.path.isfile(model_args.export_quantization_dataset): - data_path = FILEEXT2TYPE.get(model_args.export_quantization_dataset.split(".")[-1], None) - data_files = model_args.export_quantization_dataset - else: - data_path = model_args.export_quantization_dataset - data_files = None - - dataset = load_dataset(path=data_path, data_files=data_files, split="train", cache_dir=model_args.cache_dir) - maxlen = model_args.export_quantization_maxlen - - samples = [] - for _ in range(model_args.export_quantization_nsamples): - while True: - sample_idx = random.randint(0, len(dataset) - 1) - sample: Dict[str, torch.Tensor] = tokenizer(dataset[sample_idx]["text"], return_tensors="pt") - if sample["input_ids"].size(1) >= maxlen: - break # TODO: fix large maxlen - - word_idx = random.randint(0, sample["input_ids"].size(1) - maxlen - 1) - input_ids = sample["input_ids"][:, word_idx : word_idx + maxlen] - samples.append(tokenizer.decode(input_ids[0].tolist(), skip_special_tokens=True)) - - return samples - - -def _configure_attn_implementation(model_args: "ModelArguments", init_kwargs: Dict[str, Any]) -> None: - if model_args.flash_attn: - if is_flash_attn2_available(): - logger.info("Using FlashAttention-2 for faster training and inference.") - init_kwargs["attn_implementation"] = "flash_attention_2" - else: - logger.warning("FlashAttention2 is not installed.") - init_kwargs["attn_implementation"] = None - else: - init_kwargs["attn_implementation"] = "eager" - - -def _configure_rope(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None: - if model_args.rope_scaling is None: - return - - if not hasattr(config, "rope_scaling"): - logger.warning("Current model does not support RoPE scaling.") - return - - if is_trainable: - if model_args.rope_scaling == "dynamic": - logger.warning( - "Dynamic NTK scaling may not work well with fine-tuning. 
" - "See: https://github.com/huggingface/transformers/pull/24653" - ) - - current_max_length = getattr(config, "max_position_embeddings", None) - if current_max_length and model_args.model_max_length > current_max_length: - scaling_factor = float(math.ceil(model_args.model_max_length / current_max_length)) - else: - logger.warning("Input length is smaller than max length. Consider increase input length.") - scaling_factor = 1.0 - else: - scaling_factor = 2.0 - - setattr(config, "rope_scaling", {"type": model_args.rope_scaling, "factor": scaling_factor}) - logger.info( - "Using {} scaling strategy and setting scaling factor to {}".format(model_args.rope_scaling, scaling_factor) - ) - - -def _configure_longlora(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None: - if not is_trainable or not model_args.shift_attn: - return - - if getattr(config, "model_type", None) in SUPPORTED_CLASS_FOR_S2ATTN: - setattr(config, "group_size_ratio", 0.25) - apply_llama_patch() - logger.info("Using shift short attention with group_size_ratio=1/4.") - else: - logger.warning("Current model does not support shift short attention.") - - -def _configure_quantization( - config: "PretrainedConfig", - tokenizer: "PreTrainedTokenizer", - model_args: "ModelArguments", - init_kwargs: Dict[str, Any], -) -> None: - r""" - Priority: PTQ-quantized (training) > AutoGPTQ (export) > Bitsandbytes (training) - """ - if getattr(config, "quantization_config", None): # ptq - if is_deepspeed_zero3_enabled(): - raise ValueError("DeepSpeed ZeRO-3 is incompatible with quantization.") - - init_kwargs["device_map"] = {"": get_current_device()} - quantization_config: Dict[str, Any] = getattr(config, "quantization_config", None) - quant_method = quantization_config.get("quant_method", "") - - if quant_method == "gptq": - quantization_config["use_exllama"] = False # disable exllama - - if quant_method == "aqlm": - require_version( - "transformers>=4.39.0.dev0", "To fix: pip install git+https://github.com/huggingface/transformers.git" - ) - require_version("aqlm>=1.1.0", "To fix: pip install aqlm[gpu]>=1.1.0") - quantization_config["bits"] = 2 - - quant_bits = quantization_config.get("bits", "?") - logger.info("Loading {}-bit {}-quantized model.".format(quant_bits, quant_method.upper())) - - elif model_args.export_quantization_bit is not None: # auto-gptq - require_version("optimum>=1.16.0", "To fix: pip install optimum>=1.16.0") - require_version("auto_gptq>=0.5.0", "To fix: pip install auto_gptq>=0.5.0") - from accelerate.utils import get_max_memory - - if getattr(config, "model_type", None) == "chatglm": - raise ValueError("ChatGLM model is not supported.") - - init_kwargs["quantization_config"] = GPTQConfig( - bits=model_args.export_quantization_bit, - tokenizer=tokenizer, - dataset=_get_quantization_dataset(tokenizer, model_args), - ) - init_kwargs["device_map"] = "auto" - init_kwargs["max_memory"] = get_max_memory() - logger.info("Quantizing model to {} bit.".format(model_args.export_quantization_bit)) - - elif model_args.quantization_bit is not None: # bnb - if is_deepspeed_zero3_enabled(): - raise ValueError("DeepSpeed ZeRO-3 is incompatible with quantization.") - - if model_args.quantization_bit == 8: - require_version("bitsandbytes>=0.37.0", "To fix: pip install bitsandbytes>=0.37.0") - init_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True) - - elif model_args.quantization_bit == 4: - require_version("bitsandbytes>=0.39.0", "To fix: pip install bitsandbytes>=0.39.0") - 
init_kwargs["quantization_config"] = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=model_args.compute_dtype, - bnb_4bit_use_double_quant=model_args.double_quantization, - bnb_4bit_quant_type=model_args.quantization_type, - ) - - init_kwargs["device_map"] = {"": get_current_device()} - logger.info("Quantizing model to {} bit.".format(model_args.quantization_bit)) - - -def _prepare_model_for_training( - model: "PreTrainedModel", model_args: "ModelArguments", output_layer_name: str = "lm_head" -) -> None: - r""" - Includes: - (1) cast the layernorm in fp32 - (2) make output embedding layer require grads - (3) add the upcasting of the lm_head in fp32 - Inspired by: https://github.com/huggingface/peft/blob/v0.7.1/src/peft/utils/other.py#L72 - """ - if model_args.upcast_layernorm: - logger.info("Upcasting layernorm weights in float32.") - for name, param in model.named_parameters(): - if param.ndim == 1 and any(ln_name in name for ln_name in LAYERNORM_NAMES): - param.data = param.data.to(torch.float32) - - if not model_args.disable_gradient_checkpointing: - if not getattr(model, "supports_gradient_checkpointing", False): - logger.warning("Current model does not support gradient checkpointing.") - else: - # use_reentrant=False might increase VRAM usage (have not been empirically verified yet) - # According to: https://github.com/huggingface/transformers/issues/28339 - model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": True}) - model.enable_input_require_grads() - setattr(model.config, "use_cache", False) # turn off when gradient checkpointing is enabled - logger.info("Gradient checkpointing enabled.") - - if hasattr(model, output_layer_name) and model_args.upcast_lmhead_output: - - def fp32_forward_post_hook(module: torch.nn.Module, args: Tuple[torch.Tensor], output: torch.Tensor): - return output.to(torch.float32) - - logger.info("Upcasting lm_head outputs in float32.") - output_layer = getattr(model, output_layer_name) - if isinstance(output_layer, torch.nn.Linear) and output_layer.weight.dtype != torch.float32: - output_layer.register_forward_hook(fp32_forward_post_hook) - - -def patch_tokenizer(tokenizer: "PreTrainedTokenizer") -> None: - if "PreTrainedTokenizerBase" not in str(tokenizer._pad.__func__): - tokenizer._pad = MethodType(PreTrainedTokenizerBase._pad, tokenizer) - - -def patch_config( - config: "PretrainedConfig", - tokenizer: "PreTrainedTokenizer", - model_args: "ModelArguments", - init_kwargs: Dict[str, Any], - is_trainable: bool, -) -> None: - if model_args.compute_dtype is None: # priority: bf16 > fp16 > fp32 - model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None)) - - if getattr(config, "model_type", None) == "qwen": - for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]: - setattr(config, dtype_name, model_args.compute_dtype == dtype) - - _configure_attn_implementation(model_args, init_kwargs) - _configure_rope(config, model_args, is_trainable) - _configure_longlora(config, model_args, is_trainable) - _configure_quantization(config, tokenizer, model_args, init_kwargs) - - if model_args.use_cache and not is_trainable: - setattr(config, "use_cache", True) - logger.info("Using KV cache for faster generation.") - - init_kwargs["torch_dtype"] = model_args.compute_dtype - if not is_deepspeed_zero3_enabled(): - init_kwargs["low_cpu_mem_usage"] = model_args.low_cpu_mem_usage - if "device_map" not in init_kwargs: # quant models cannot use auto 
device map - init_kwargs["device_map"] = model_args.device_map or {"": get_current_device()} - - if init_kwargs["device_map"] == "auto": - init_kwargs["offload_folder"] = model_args.offload_folder - - -def patch_model( - model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer", model_args: "ModelArguments", is_trainable: bool -) -> None: - if "GenerationMixin" not in str(model.generate.__func__): - model.generate = MethodType(PreTrainedModel.generate, model) - - if getattr(model.config, "model_type", None) == "chatglm": - setattr(model, "lm_head", model.transformer.output_layer) - setattr(model, "_keys_to_ignore_on_save", ["lm_head.weight"]) - - if model_args.resize_vocab: - _resize_embedding_layer(model, tokenizer) - - if is_trainable: - _prepare_model_for_training(model, model_args) - - if getattr(model.config, "model_type", None) == "mixtral" and is_deepspeed_zero3_enabled(): - require_version("deepspeed>=0.13.0", "To fix: pip install deepspeed>=0.13.0") - from deepspeed.utils import set_z3_leaf_modules # type: ignore - from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock - - set_z3_leaf_modules(model, [MixtralSparseMoeBlock]) - - if is_trainable: - patch_mixtral_replace_moe_impl() - - try: - model.add_model_tags(["llama-factory"]) - except Exception: - logger.warning("Cannot properly tag the model.") - - -def patch_valuehead_model(model: "AutoModelForCausalLMWithValueHead") -> None: - def tie_weights(self: "AutoModelForCausalLMWithValueHead") -> None: - if isinstance(self.pretrained_model, PreTrainedModel): - self.pretrained_model.tie_weights() - - def get_input_embeddings(self: "AutoModelForCausalLMWithValueHead") -> torch.nn.Module: - if isinstance(self.pretrained_model, PreTrainedModel): - return self.pretrained_model.get_input_embeddings() - - def create_or_update_model_card(self: "AutoModelForCausalLMWithValueHead", output_dir: str) -> None: - if isinstance(self.pretrained_model, PeftModel): - self.pretrained_model.create_or_update_model_card(output_dir) - - ignore_modules = [name for name, _ in model.named_parameters() if "pretrained_model" in name] - setattr(model, "_keys_to_ignore_on_save", ignore_modules) - setattr(model, "tie_weights", MethodType(tie_weights, model)) - setattr(model, "get_input_embeddings", MethodType(get_input_embeddings, model)) - setattr(model, "create_or_update_model_card", MethodType(create_or_update_model_card, model)) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/utils.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/utils.py deleted file mode 100644 index 4a4ecf2..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/model/utils.py +++ /dev/null @@ -1,108 +0,0 @@ -from typing import TYPE_CHECKING, Dict, List - -import torch -from transformers import PreTrainedModel -from transformers.utils import cached_file - -from ..extras.constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME -from ..extras.logging import get_logger - - -if TYPE_CHECKING: - from transformers import PretrainedConfig, PreTrainedTokenizer - - from ..hparams import ModelArguments - - -logger = get_logger(__name__) - - -def find_all_linear_modules(model: "PreTrainedModel") -> List[str]: - r""" - Finds all available modules to apply lora. 
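# Illustrative sketch, not part of the patch: collecting candidate LoRA target names the
# way find_all_linear_modules does for unquantized models - keep the last path component
# of every nn.Linear, skipping the output layer. The toy module is made up.
import torch

class ToyBlock(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.q_proj = torch.nn.Linear(8, 8)
        self.v_proj = torch.nn.Linear(8, 8)
        self.lm_head = torch.nn.Linear(8, 16)

model = ToyBlock()
output_layer_names = ["lm_head"]
module_names = set()
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear) and not any(o in name for o in output_layer_names):
        module_names.add(name.split(".")[-1])
print(module_names)   # {'q_proj', 'v_proj'}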
- """ - quantization_method = getattr(model, "quantization_method", None) - if quantization_method is None: - linear_cls = torch.nn.Linear - elif quantization_method == "bitsandbytes": - import bitsandbytes as bnb - - linear_cls = bnb.nn.Linear4bit if getattr(model, "is_loaded_in_4bit", False) else bnb.nn.Linear8bitLt - else: - raise ValueError("Finding linear modules for {} models is not supported.".format(quantization_method)) - - output_layer_names = ["lm_head"] - if model.config.model_type == "chatglm": - output_layer_names.append("output_layer") - - module_names = set() - for name, module in model.named_modules(): - if isinstance(module, linear_cls) and not any(output_layer in name for output_layer in output_layer_names): - module_names.add(name.split(".")[-1]) - - logger.info("Found linear modules: {}".format(",".join(module_names))) - return list(module_names) - - -def find_expanded_modules(model: "PreTrainedModel", target_modules: List[str], num_layer_trainable: int) -> List[str]: - r""" - Finds the modules in the expanded blocks to apply lora. - """ - num_layers = getattr(model.config, "num_hidden_layers", None) - if not num_layers: - raise ValueError("Model was not supported.") - - if num_layers % num_layer_trainable != 0: - raise ValueError( - "`num_layers` {} should be divisible by `num_layer_trainable` {}.".format(num_layers, num_layer_trainable) - ) - - stride = num_layers // num_layer_trainable - trainable_layer_ids = range(stride - 1, num_layers + stride - 1, stride) - trainable_layers = [".{:d}.".format(idx) for idx in trainable_layer_ids] - module_names = [] - for name, _ in model.named_modules(): - if any(target_module in name for target_module in target_modules) and any( - trainable_layer in name for trainable_layer in trainable_layers - ): - module_names.append(name) - - logger.info("Apply lora to layers: {}".format(",".join(map(str, trainable_layer_ids)))) - return module_names - - -def load_valuehead_params(path_or_repo_id: str, model_args: "ModelArguments") -> Dict[str, torch.Tensor]: - r""" - Loads value head parameters from Hugging Face Hub or local disk. - - Returns: dict with keys `v_head.summary.weight` and `v_head.summary.bias`. 
- """ - kwargs = {"path_or_repo_id": path_or_repo_id, "cache_dir": model_args.cache_dir, "token": model_args.hf_hub_token} - - try: - from safetensors import safe_open - - vhead_file = cached_file(filename=V_HEAD_SAFE_WEIGHTS_NAME, **kwargs) - with safe_open(vhead_file, framework="pt", device="cpu") as f: - return {key: f.get_tensor(key) for key in f.keys()} - except Exception as err: - logger.info("Failed to load {}: {}".format(V_HEAD_SAFE_WEIGHTS_NAME, str(err))) - - try: - vhead_file = cached_file(filename=V_HEAD_WEIGHTS_NAME, **kwargs) - return torch.load(vhead_file, map_location="cpu") - except Exception as err: - logger.info("Failed to load {}: {}".format(V_HEAD_WEIGHTS_NAME, str(err))) - - logger.info("Provided path ({}) does not contain value head weights.".format(path_or_repo_id)) - logger.info("Ignore these messages if you are not resuming the training of a value head model.") - return None - - -def register_autoclass(config: "PretrainedConfig", model: "PreTrainedModel", tokenizer: "PreTrainedTokenizer"): - if "AutoConfig" in getattr(config, "auto_map", {}): - config.__class__.register_for_auto_class() - if "AutoModelForCausalLM" in getattr(config, "auto_map", {}): - model.__class__.register_for_auto_class() - if "AutoTokenizer" in tokenizer.init_kwargs.get("auto_map", {}): - tokenizer.__class__.register_for_auto_class() diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/__init__.py deleted file mode 100644 index 6c22bc1..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .tuner import export_model, run_exp - - -__all__ = ["export_model", "run_exp"] diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/dpo/collator.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/dpo/collator.py deleted file mode 100644 index 7e8ba1c..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/dpo/collator.py +++ /dev/null @@ -1,54 +0,0 @@ -from dataclasses import dataclass -from typing import Any, Dict, List, Sequence, Tuple - -import torch -from transformers import DataCollatorForSeq2Seq - - -@dataclass -class DPODataCollatorWithPadding(DataCollatorForSeq2Seq): - r""" - Data collator for pairwise data. - """ - - def _pad_labels(self, batch: torch.Tensor, positions: List[Tuple[int, int]]) -> torch.Tensor: - padded_labels = [] - for feature, (prompt_len, answer_len) in zip(batch, positions): - if self.tokenizer.padding_side == "left": - start, end = feature.size(0) - answer_len, feature.size(0) - else: - start, end = prompt_len, prompt_len + answer_len - padded_tensor = self.label_pad_token_id * torch.ones_like(feature) - padded_tensor[start:end] = feature[start:end] - padded_labels.append(padded_tensor) - return torch.stack(padded_labels, dim=0).contiguous() # in contiguous memory - - def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]: - r""" - Pads batched data to the longest sequence in the batch. - - We generate 2 * n examples where the first n examples represent chosen examples and - the last n examples represent rejected examples. 
- """ - concatenated_features = [] - label_positions = [] - for key in ("chosen_ids", "rejected_ids"): - for feature in features: - prompt_len, answer_len = len(feature["prompt_ids"]), len(feature[key]) - concatenated_features.append( - { - "input_ids": feature["prompt_ids"] + feature[key], - "attention_mask": [1] * (prompt_len + answer_len), - } - ) - label_positions.append((prompt_len, answer_len)) - - batch = self.tokenizer.pad( - concatenated_features, - padding=self.padding, - max_length=self.max_length, - pad_to_multiple_of=self.pad_to_multiple_of, - return_tensors=self.return_tensors, - ) - batch["labels"] = self._pad_labels(batch["input_ids"], label_positions) - return batch diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/dpo/trainer.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/dpo/trainer.py deleted file mode 100644 index ed8bf4c..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/dpo/trainer.py +++ /dev/null @@ -1,149 +0,0 @@ -from collections import defaultdict -from contextlib import nullcontext -from typing import TYPE_CHECKING, Dict, Literal, Optional, Tuple, Union - -import torch -from transformers import BatchEncoding, Trainer -from trl import DPOTrainer -from trl.trainer.utils import disable_dropout_in_model - -from ...extras.constants import IGNORE_INDEX - - -if TYPE_CHECKING: - from transformers import PreTrainedModel - - -class CustomDPOTrainer(DPOTrainer): - def __init__( - self, - beta: float, - loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair"], - ftx_gamma: float, - model: Union["PreTrainedModel", torch.nn.Module], - ref_model: Optional[Union["PreTrainedModel", torch.nn.Module]] = None, - disable_dropout: bool = True, - **kwargs, - ): - if disable_dropout: - disable_dropout_in_model(model) - if ref_model is not None: - disable_dropout_in_model(ref_model) - - self.reference_free = False - self.use_dpo_data_collator = True # hack to avoid warning - self.generate_during_eval = False # disable at evaluation - self.label_pad_token_id = IGNORE_INDEX - self.padding_value = 0 - self.is_encoder_decoder = model.config.is_encoder_decoder - self.precompute_ref_log_probs = False - self._precomputed_train_ref_log_probs = False - self._precomputed_eval_ref_log_probs = False - self._peft_has_been_casted_to_bf16 = False - - self.ref_model = ref_model - self.beta = beta - self.label_smoothing = 0 - self.loss_type = loss_type - self.ftx_gamma = ftx_gamma - self._stored_metrics = defaultdict(lambda: defaultdict(list)) - - Trainer.__init__(self, model=model, **kwargs) - if not hasattr(self, "accelerator"): - raise AttributeError("Please update `transformers`.") - - if ref_model is not None: - if self.is_deepspeed_enabled: - if not ( - getattr(ref_model, "is_loaded_in_8bit", False) or getattr(ref_model, "is_loaded_in_4bit", False) - ): # quantized models are already set on the correct device - self.ref_model = self._prepare_deepspeed(self.ref_model) - else: - self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) - - def sft_loss(self, chosen_logits: torch.FloatTensor, chosen_labels: torch.LongTensor) -> torch.Tensor: - r""" - Computes supervised cross-entropy loss of given labels under the given logits. - - Returns: - A tensor of shape (batch_size,) containing the cross-entropy loss of each samples. 
- """ - all_logps = self.get_batch_logps(chosen_logits, chosen_labels, average_log_prob=True) - return -all_logps - - def concatenated_forward( - self, model: "PreTrainedModel", batch: Dict[str, torch.Tensor] - ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: - batch_copied = BatchEncoding({k: v.detach().clone() for k, v in batch.items()}) # avoid error - - all_logits = model( - input_ids=batch_copied["input_ids"], attention_mask=batch_copied["attention_mask"], return_dict=True - ).logits.to(torch.float32) - - all_logps = self.get_batch_logps( - all_logits, - batch["labels"], - average_log_prob=False, - label_pad_token_id=self.label_pad_token_id, - ) - batch_size = batch["input_ids"].size(0) // 2 - chosen_logps, rejected_logps = all_logps.split(batch_size, dim=0) - chosen_logits, rejected_logits = all_logits.split(batch_size, dim=0) - return chosen_logps, rejected_logps, chosen_logits, rejected_logits - - def get_batch_loss_metrics( - self, - model: "PreTrainedModel", - batch: Dict[str, torch.Tensor], - train_eval: Literal["train", "eval"] = "train", - ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: - r""" - Computes the DPO loss and other metrics for the given batch of inputs for train or test. - """ - metrics = {} - ( - policy_chosen_logps, - policy_rejected_logps, - policy_chosen_logits, - policy_rejected_logits, - ) = self.concatenated_forward(model, batch) - with torch.no_grad(): - if self.ref_model is None: - ref_model = self.model - ref_context = self.accelerator.unwrap_model(self.model).disable_adapter() - else: - ref_model = self.ref_model - ref_context = nullcontext() - - with ref_context: - ( - reference_chosen_logps, - reference_rejected_logps, - _, - _, - ) = self.concatenated_forward(ref_model, batch) - - losses, chosen_rewards, rejected_rewards = self.dpo_loss( - policy_chosen_logps, - policy_rejected_logps, - reference_chosen_logps, - reference_rejected_logps, - ) - if self.ftx_gamma > 1e-6: - batch_size = batch["input_ids"].size(0) // 2 - chosen_labels, _ = batch["labels"].split(batch_size, dim=0) - losses += self.ftx_gamma * self.sft_loss(policy_chosen_logits, chosen_labels) - - reward_accuracies = (chosen_rewards > rejected_rewards).float() - - prefix = "eval_" if train_eval == "eval" else "" - metrics[f"{prefix}rewards/chosen"] = chosen_rewards.cpu().mean() - metrics[f"{prefix}rewards/rejected"] = rejected_rewards.cpu().mean() - metrics[f"{prefix}rewards/accuracies"] = reward_accuracies.cpu().mean() - metrics[f"{prefix}rewards/margins"] = (chosen_rewards - rejected_rewards).cpu().mean() - metrics[f"{prefix}logps/rejected"] = policy_rejected_logps.detach().cpu().mean() - metrics[f"{prefix}logps/chosen"] = policy_chosen_logps.detach().cpu().mean() - metrics[f"{prefix}logits/rejected"] = policy_rejected_logits.detach().cpu().mean() - metrics[f"{prefix}logits/chosen"] = policy_chosen_logits.detach().cpu().mean() - - return losses.mean(), metrics diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/rm/collator.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/rm/collator.py deleted file mode 100644 index 8d5d4ad..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/rm/collator.py +++ /dev/null @@ -1,29 +0,0 @@ -from dataclasses import dataclass -from typing import Any, Dict, Sequence - -import torch -from transformers import DataCollatorWithPadding - - -@dataclass -class PairwiseDataCollatorWithPadding(DataCollatorWithPadding): - r""" - Data collator for pairwise data. 
- """ - - def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, torch.Tensor]: - r""" - Pads batched data to the longest sequence in the batch. - - We generate 2 * n examples where the first n examples represent chosen examples and - the last n examples represent rejected examples. - """ - features = [ - { - "input_ids": feature["prompt_ids"] + feature[key], - "attention_mask": [1] * (len(feature["prompt_ids"]) + len(feature[key])), - } - for key in ("chosen_ids", "rejected_ids") - for feature in features - ] - return super().__call__(features) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/utils.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/utils.py deleted file mode 100644 index 425ff18..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/train/utils.py +++ /dev/null @@ -1,246 +0,0 @@ -import math -from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union - -import torch -from transformers.optimization import get_scheduler -from transformers.utils.versions import require_version - -from ..extras.logging import get_logger -from ..extras.packages import is_galore_available -from ..hparams import FinetuningArguments, ModelArguments -from ..model import load_model_and_tokenizer, load_valuehead_params - - -if is_galore_available(): - from galore_torch import GaLoreAdafactor, GaLoreAdamW, GaLoreAdamW8bit - - -if TYPE_CHECKING: - from datasets import Dataset, IterableDataset - from transformers import Seq2SeqTrainingArguments, Trainer - from transformers.modeling_utils import PreTrainedModel - from trl import AutoModelForCausalLMWithValueHead - - from ..hparams import DataArguments - - -logger = get_logger(__name__) - - -class DummyOptimizer(torch.optim.Optimizer): - def __init__(self, *args, **kwargs): - dummy_tensor = torch.randn(1, 1) - super().__init__([dummy_tensor], {"lr": 1e-3}) - - def zero_grad(self, set_to_none: bool = True) -> None: - pass - - def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]: - pass - - -def create_modelcard_and_push( - trainer: "Trainer", - model_args: "ModelArguments", - data_args: "DataArguments", - training_args: "Seq2SeqTrainingArguments", - finetuning_args: "FinetuningArguments", -) -> None: - kwargs = { - "tasks": "text-generation", - "finetuned_from": model_args.model_name_or_path, - "dataset": [dataset.strip() for dataset in data_args.dataset.split(",")], - "tags": ["llama-factory", finetuning_args.finetuning_type], - } - if not training_args.do_train: - pass - elif training_args.push_to_hub: - trainer.push_to_hub(**kwargs) - else: - trainer.create_model_card(license="other", **kwargs) # prevent from connecting to hub - - -def create_ref_model( - model_args: "ModelArguments", finetuning_args: "FinetuningArguments", add_valuehead: bool = False -) -> Union["PreTrainedModel", "AutoModelForCausalLMWithValueHead"]: - r""" - Creates reference model for PPO/DPO training. Evaluation mode is not supported. - - The valuehead parameter is randomly initialized since it is useless for PPO training. 
- """ - if finetuning_args.ref_model is not None: - ref_model_args_dict = model_args.to_dict() - ref_model_args_dict.update( - dict( - model_name_or_path=finetuning_args.ref_model, - adapter_name_or_path=finetuning_args.ref_model_adapters, - quantization_bit=finetuning_args.ref_model_quantization_bit, - ) - ) - ref_model_args = ModelArguments(**ref_model_args_dict) - ref_finetuning_args = FinetuningArguments(finetuning_type="lora") - ref_model, _ = load_model_and_tokenizer( - ref_model_args, ref_finetuning_args, is_trainable=False, add_valuehead=add_valuehead - ) - logger.info("Created reference model from {}".format(finetuning_args.ref_model)) - else: - if finetuning_args.finetuning_type == "lora": - ref_model = None - else: - ref_model, _ = load_model_and_tokenizer( - model_args, finetuning_args, is_trainable=False, add_valuehead=add_valuehead - ) - logger.info("Created reference model from the model itself.") - - return ref_model - - -def create_reward_model( - model: "AutoModelForCausalLMWithValueHead", model_args: "ModelArguments", finetuning_args: "FinetuningArguments" -) -> "AutoModelForCausalLMWithValueHead": - r""" - Creates reward model for PPO training. - """ - if finetuning_args.reward_model_type == "api": - assert finetuning_args.reward_model.startswith("http"), "Please provide full url." - logger.info("Use reward server {}".format(finetuning_args.reward_model)) - return finetuning_args.reward_model - elif finetuning_args.reward_model_type == "lora": - model.pretrained_model.load_adapter(finetuning_args.reward_model, "reward") - for name, param in model.named_parameters(): # https://github.com/huggingface/peft/issues/1090 - if "default" in name: - param.data = param.data.to(torch.float32) # trainable params should in fp32 - vhead_params = load_valuehead_params(finetuning_args.reward_model, model_args) - assert vhead_params is not None, "Reward model is not correctly loaded." 
- model.register_buffer("reward_head_weight", vhead_params["v_head.summary.weight"], persistent=False) - model.register_buffer("reward_head_bias", vhead_params["v_head.summary.bias"], persistent=False) - model.register_buffer( - "default_head_weight", torch.zeros_like(vhead_params["v_head.summary.weight"]), persistent=False - ) - model.register_buffer( - "default_head_bias", torch.zeros_like(vhead_params["v_head.summary.bias"]), persistent=False - ) - logger.info("Loaded adapter weights of reward model from {}".format(finetuning_args.reward_model)) - return None - else: - reward_model_args_dict = model_args.to_dict() - reward_model_args_dict.update( - dict( - model_name_or_path=finetuning_args.reward_model, - adapter_name_or_path=finetuning_args.reward_model_adapters, - quantization_bit=finetuning_args.reward_model_quantization_bit, - ) - ) - reward_model_args = ModelArguments(**reward_model_args_dict) - reward_finetuning_args = FinetuningArguments(finetuning_type="lora") - reward_model, _ = load_model_and_tokenizer( - reward_model_args, reward_finetuning_args, is_trainable=False, add_valuehead=True - ) - logger.info("Loaded full weights of reward model from {}".format(finetuning_args.reward_model)) - logger.warning("Please ensure the ppo model and reward model share SAME tokenizer and vocabulary.") - return reward_model - - -def create_custom_optimzer( - model: "PreTrainedModel", - dataset: Union["Dataset", "IterableDataset"], - training_args: "Seq2SeqTrainingArguments", - finetuning_args: "FinetuningArguments", -) -> Optional["torch.optim.Optimizer"]: - if not finetuning_args.use_galore: - return None - - require_version("galore_torch", "To fix: pip install git+https://github.com/hiyouga/GaLore.git") - galore_params: List[torch.nn.Parameter] = [] - galore_targets = finetuning_args.galore_target.split(",") - - for name, module in model.named_modules(): - if isinstance(module, torch.nn.Linear) and any(target in name for target in galore_targets): - for param in module.parameters(): - if param.requires_grad and len(param.shape) > 1: - galore_params.append(param) - - id_galore_params = {id(param) for param in galore_params} - trainable_params = filter(lambda param: param.requires_grad, model.parameters()) - non_galore_params = [param for param in trainable_params if id(param) not in id_galore_params] - - if training_args.optim == "adamw_torch": - optim_class = GaLoreAdamW - optim_kwargs = { - "lr": training_args.learning_rate, - "eps": training_args.adam_epsilon, - "betas": (training_args.adam_beta1, training_args.adam_beta2), - "weight_decay": training_args.weight_decay, - } - - elif training_args.optim in ["adamw_bnb_8bit", "adamw_8bit", "paged_adamw_8bit"]: - optim_class = GaLoreAdamW8bit - optim_kwargs = { - "lr": training_args.learning_rate, - "eps": training_args.adam_epsilon, - "betas": (training_args.adam_beta1, training_args.adam_beta2), - "weight_decay": training_args.weight_decay, - "optim_bits": 8, - "is_paged": "paged" in training_args.optim, - } - - elif training_args.optim == "adafactor": - optim_class = GaLoreAdafactor - optim_kwargs = { - "lr": training_args.learning_rate, - "weight_decay": training_args.weight_decay, - } - - else: - raise NotImplementedError("Unknow optim: {}".format(training_args.optim)) - - galore_kwargs = { - "rank": finetuning_args.galore_rank, - "update_proj_gap": finetuning_args.galore_update_interval, - "scale": finetuning_args.galore_scale, - "proj_type": finetuning_args.galore_proj_type, - } - - if finetuning_args.galore_layerwise: - if 
training_args.gradient_accumulation_steps != 1: - raise ValueError("Per-layer GaLore does not support gradient accumulation.") - - if training_args.max_steps > 0: - num_training_steps = training_args.max_steps - else: - total_train_batch_size = training_args.per_device_train_batch_size * training_args.world_size - num_training_steps = training_args.num_train_epochs * math.ceil(len(dataset) / total_train_batch_size) - - optimizer_dict: Dict["torch.Tensor", "torch.optim.Optimizer"] = {} - for param in non_galore_params: - param_groups = [dict(params=[param])] - optimizer_dict[param] = optim_class(param_groups, **optim_kwargs) - for param in galore_params: - param_groups = [dict(params=[param], **galore_kwargs)] - optimizer_dict[param] = optim_class(param_groups, **optim_kwargs) - - scheduler_dict: Dict["torch.Tensor", "torch.optim.lr_scheduler.LRScheduler"] = {} - for param in non_galore_params + galore_params: - scheduler_dict[param] = get_scheduler( - training_args.lr_scheduler_type, - optimizer=optimizer_dict[param], - num_warmup_steps=training_args.get_warmup_steps(num_training_steps) * 2, - num_training_steps=num_training_steps * 2, - ) - - def optimizer_hook(param: "torch.Tensor"): - if param.grad is not None: - optimizer_dict[param].step() - optimizer_dict[param].zero_grad() - scheduler_dict[param].step() - - for param in non_galore_params + galore_params: - param.register_post_accumulate_grad_hook(optimizer_hook) - - optimizer = DummyOptimizer() - else: - param_groups = [dict(params=non_galore_params), dict(params=galore_params, **galore_kwargs)] - optimizer = optim_class(param_groups, **optim_kwargs) - - logger.info("Using GaLore optimizer, may cause hanging at the start of training, wait patiently.") - return optimizer diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/__init__.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/__init__.py deleted file mode 100644 index 3e82dd6..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .interface import create_ui, create_web_demo - - -__all__ = ["create_ui", "create_web_demo"] diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/common.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/common.py deleted file mode 100644 index 961d6f0..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/common.py +++ /dev/null @@ -1,115 +0,0 @@ -import json -import os -from collections import defaultdict -from typing import Any, Dict, Optional - -import gradio as gr -from peft.utils import SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME - -from ..extras.constants import ( - DATA_CONFIG, - DEFAULT_MODULE, - DEFAULT_TEMPLATE, - PEFT_METHODS, - SUPPORTED_MODELS, - TRAINING_STAGES, - DownloadSource, -) -from ..extras.misc import use_modelscope - - -ADAPTER_NAMES = {WEIGHTS_NAME, SAFETENSORS_WEIGHTS_NAME} -DEFAULT_CACHE_DIR = "cache" -DEFAULT_DATA_DIR = "data" -DEFAULT_SAVE_DIR = "saves" -USER_CONFIG = "user.config" - - -def get_save_dir(*args) -> os.PathLike: - return os.path.join(DEFAULT_SAVE_DIR, *args) - - -def get_config_path() -> os.PathLike: - return os.path.join(DEFAULT_CACHE_DIR, USER_CONFIG) - - -def load_config() -> Dict[str, Any]: - try: - with open(get_config_path(), "r", encoding="utf-8") as f: - return json.load(f) - except Exception: - return {"lang": None, "last_model": None, "path_dict": {}, "cache_dir": None} - - -def save_config(lang: str, model_name: Optional[str] = None, model_path: Optional[str] = None) -> None: - 
os.makedirs(DEFAULT_CACHE_DIR, exist_ok=True) - user_config = load_config() - user_config["lang"] = lang or user_config["lang"] - if model_name: - user_config["last_model"] = model_name - user_config["path_dict"][model_name] = model_path - with open(get_config_path(), "w", encoding="utf-8") as f: - json.dump(user_config, f, indent=2, ensure_ascii=False) - - -def get_model_path(model_name: str) -> str: - user_config = load_config() - path_dict: Dict[DownloadSource, str] = SUPPORTED_MODELS.get(model_name, defaultdict(str)) - model_path = user_config["path_dict"].get(model_name, None) or path_dict.get(DownloadSource.DEFAULT, None) - if ( - use_modelscope() - and path_dict.get(DownloadSource.MODELSCOPE) - and model_path == path_dict.get(DownloadSource.DEFAULT) - ): # replace path - model_path = path_dict.get(DownloadSource.MODELSCOPE) - return model_path - - -def get_prefix(model_name: str) -> str: - return model_name.split("-")[0] - - -def get_module(model_name: str) -> str: - return DEFAULT_MODULE.get(get_prefix(model_name), "q_proj,v_proj") - - -def get_template(model_name: str) -> str: - if model_name and model_name.endswith("Chat") and get_prefix(model_name) in DEFAULT_TEMPLATE: - return DEFAULT_TEMPLATE[get_prefix(model_name)] - return "default" - - -def list_adapters(model_name: str, finetuning_type: str) -> Dict[str, Any]: - if finetuning_type not in PEFT_METHODS: - return gr.update(value=[], choices=[], interactive=False) - - adapters = [] - if model_name and finetuning_type == "lora": - save_dir = get_save_dir(model_name, finetuning_type) - if save_dir and os.path.isdir(save_dir): - for adapter in os.listdir(save_dir): - if os.path.isdir(os.path.join(save_dir, adapter)) and any( - os.path.isfile(os.path.join(save_dir, adapter, name)) for name in ADAPTER_NAMES - ): - adapters.append(adapter) - return gr.update(value=[], choices=adapters, interactive=True) - - -def load_dataset_info(dataset_dir: str) -> Dict[str, Dict[str, Any]]: - try: - with open(os.path.join(dataset_dir, DATA_CONFIG), "r", encoding="utf-8") as f: - return json.load(f) - except Exception as err: - print("Cannot open {} due to {}.".format(os.path.join(dataset_dir, DATA_CONFIG), str(err))) - return {} - - -def list_dataset(dataset_dir: str = None, training_stage: str = list(TRAINING_STAGES.keys())[0]) -> Dict[str, Any]: - dataset_info = load_dataset_info(dataset_dir if dataset_dir is not None else DEFAULT_DATA_DIR) - ranking = TRAINING_STAGES[training_stage] in ["rm", "dpo"] - datasets = [k for k, v in dataset_info.items() if v.get("ranking", False) == ranking] - return gr.update(value=[], choices=datasets) - - -def autoset_packing(training_stage: str = list(TRAINING_STAGES.keys())[0]) -> Dict[str, Any]: - return gr.update(value=(TRAINING_STAGES[training_stage] == "pt")) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/chatbot.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/chatbot.py deleted file mode 100644 index bf5bb66..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/chatbot.py +++ /dev/null @@ -1,62 +0,0 @@ -from typing import TYPE_CHECKING, Dict, Tuple - -import gradio as gr - -from ...data import Role -from ..utils import check_json_schema - - -if TYPE_CHECKING: - from gradio.blocks import Block - from gradio.components import Component - - from ..engine import Engine - - -def create_chat_box( - engine: "Engine", visible: bool = False -) -> Tuple["Block", "Component", "Component", Dict[str, "Component"]]: - with gr.Box(visible=visible) as 
chat_box: - chatbot = gr.Chatbot() - messages = gr.State([]) - with gr.Row(): - with gr.Column(scale=4): - role = gr.Dropdown(choices=[Role.USER.value, Role.OBSERVATION.value], value=Role.USER.value) - system = gr.Textbox(show_label=False) - tools = gr.Textbox(show_label=False, lines=2) - query = gr.Textbox(show_label=False, lines=8) - submit_btn = gr.Button(variant="primary") - - with gr.Column(scale=1): - max_new_tokens = gr.Slider(8, 4096, value=512, step=1) - top_p = gr.Slider(0.01, 1.0, value=0.7, step=0.01) - temperature = gr.Slider(0.01, 1.5, value=0.95, step=0.01) - clear_btn = gr.Button() - - tools.input(check_json_schema, [tools, engine.manager.get_elem_by_name("top.lang")]) - - submit_btn.click( - engine.chatter.predict, - [chatbot, role, query, messages, system, tools, max_new_tokens, top_p, temperature], - [chatbot, messages], - show_progress=True, - ).then(lambda: gr.update(value=""), outputs=[query]) - - clear_btn.click(lambda: ([], []), outputs=[chatbot, messages], show_progress=True) - - return ( - chat_box, - chatbot, - messages, - dict( - role=role, - system=system, - tools=tools, - query=query, - submit_btn=submit_btn, - max_new_tokens=max_new_tokens, - top_p=top_p, - temperature=temperature, - clear_btn=clear_btn, - ), - ) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/top.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/top.py deleted file mode 100644 index d8b4958..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/top.py +++ /dev/null @@ -1,59 +0,0 @@ -from typing import TYPE_CHECKING, Dict, Tuple - -import gradio as gr - -from ...data import templates -from ...extras.constants import METHODS, SUPPORTED_MODELS -from ..common import get_model_path, get_template, list_adapters, save_config -from ..utils import can_quantize - - -if TYPE_CHECKING: - from gradio.components import Component - - -def create_top() -> Tuple["gr.Dropdown", Dict[str, "Component"]]: - available_models = list(SUPPORTED_MODELS.keys()) + ["Custom"] - - with gr.Row(): - lang = gr.Dropdown(choices=["en", "ru", "zh"], scale=1) - model_name = gr.Dropdown(choices=available_models, scale=3) - model_path = gr.Textbox(scale=3) - - with gr.Row(): - finetuning_type = gr.Dropdown(choices=METHODS, value="lora", scale=1) - adapter_path = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=5) - refresh_btn = gr.Button(scale=1) - - with gr.Accordion(label="Advanced config", open=False) as advanced_tab: - with gr.Row(): - quantization_bit = gr.Dropdown(choices=["none", "8", "4"], value="none") - template = gr.Dropdown(choices=list(templates.keys()), value="default") - rope_scaling = gr.Radio(choices=["none", "linear", "dynamic"], value="none") - booster = gr.Radio(choices=["none", "flashattn", "unsloth"], value="none") - - model_name.change(list_adapters, [model_name, finetuning_type], [adapter_path], queue=False).then( - get_model_path, [model_name], [model_path], queue=False - ).then(get_template, [model_name], [template], queue=False) # do not save config since the below line will save - - model_path.change(save_config, inputs=[lang, model_name, model_path], queue=False) - - finetuning_type.change(list_adapters, [model_name, finetuning_type], [adapter_path], queue=False).then( - can_quantize, [finetuning_type], [quantization_bit], queue=False - ) - - refresh_btn.click(list_adapters, [model_name, finetuning_type], [adapter_path], queue=False) - - return lang, dict( - lang=lang, - model_name=model_name, - model_path=model_path, - 
finetuning_type=finetuning_type, - adapter_path=adapter_path, - refresh_btn=refresh_btn, - advanced_tab=advanced_tab, - quantization_bit=quantization_bit, - template=template, - rope_scaling=rope_scaling, - booster=booster, - ) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/train.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/train.py deleted file mode 100644 index 0725f5e..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/components/train.py +++ /dev/null @@ -1,246 +0,0 @@ -from typing import TYPE_CHECKING, Dict - -import gradio as gr -from transformers.trainer_utils import SchedulerType - -from ...extras.constants import TRAINING_STAGES -from ..common import DEFAULT_DATA_DIR, autoset_packing, list_adapters, list_dataset -from ..components.data import create_preview_box -from ..utils import gen_plot - - -if TYPE_CHECKING: - from gradio.components import Component - - from ..engine import Engine - - -def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: - input_elems = engine.manager.get_base_elems() - elem_dict = dict() - - with gr.Row(): - training_stage = gr.Dropdown( - choices=list(TRAINING_STAGES.keys()), value=list(TRAINING_STAGES.keys())[0], scale=2 - ) - dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=2) - dataset = gr.Dropdown(multiselect=True, scale=4) - preview_elems = create_preview_box(dataset_dir, dataset) - - dataset_dir.change(list_dataset, [dataset_dir, training_stage], [dataset], queue=False) - - input_elems.update({training_stage, dataset_dir, dataset}) - elem_dict.update(dict(training_stage=training_stage, dataset_dir=dataset_dir, dataset=dataset, **preview_elems)) - - with gr.Row(): - learning_rate = gr.Textbox(value="5e-5") - num_train_epochs = gr.Textbox(value="3.0") - max_grad_norm = gr.Textbox(value="1.0") - max_samples = gr.Textbox(value="100000") - compute_type = gr.Dropdown(choices=["fp16", "bf16", "fp32", "pure_bf16"], value="fp16") - - input_elems.update({learning_rate, num_train_epochs, max_grad_norm, max_samples, compute_type}) - elem_dict.update( - dict( - learning_rate=learning_rate, - num_train_epochs=num_train_epochs, - max_grad_norm=max_grad_norm, - max_samples=max_samples, - compute_type=compute_type, - ) - ) - - with gr.Row(): - cutoff_len = gr.Slider(value=1024, minimum=4, maximum=16384, step=1) - batch_size = gr.Slider(value=2, minimum=1, maximum=1024, step=1) - gradient_accumulation_steps = gr.Slider(value=8, minimum=1, maximum=1024, step=1) - val_size = gr.Slider(value=0, minimum=0, maximum=1, step=0.001) - lr_scheduler_type = gr.Dropdown(choices=[scheduler.value for scheduler in SchedulerType], value="cosine") - - input_elems.update({cutoff_len, batch_size, gradient_accumulation_steps, val_size, lr_scheduler_type}) - elem_dict.update( - dict( - cutoff_len=cutoff_len, - batch_size=batch_size, - gradient_accumulation_steps=gradient_accumulation_steps, - val_size=val_size, - lr_scheduler_type=lr_scheduler_type, - ) - ) - - with gr.Accordion(label="Extra config", open=False) as extra_tab: - with gr.Row(): - logging_steps = gr.Slider(value=5, minimum=5, maximum=1000, step=5) - save_steps = gr.Slider(value=100, minimum=10, maximum=5000, step=10) - warmup_steps = gr.Slider(value=0, minimum=0, maximum=5000, step=1) - neftune_alpha = gr.Slider(value=0, minimum=0, maximum=10, step=0.1) - optim = gr.Textbox(value="adamw_torch") - - with gr.Row(): - resize_vocab = gr.Checkbox() - packing = gr.Checkbox() - upcast_layernorm = gr.Checkbox() - use_llama_pro = gr.Checkbox() - 
shift_attn = gr.Checkbox() - - input_elems.update( - { - logging_steps, - save_steps, - warmup_steps, - neftune_alpha, - optim, - resize_vocab, - packing, - upcast_layernorm, - use_llama_pro, - shift_attn, - } - ) - elem_dict.update( - dict( - extra_tab=extra_tab, - logging_steps=logging_steps, - save_steps=save_steps, - warmup_steps=warmup_steps, - neftune_alpha=neftune_alpha, - optim=optim, - resize_vocab=resize_vocab, - packing=packing, - upcast_layernorm=upcast_layernorm, - use_llama_pro=use_llama_pro, - shift_attn=shift_attn, - ) - ) - - with gr.Accordion(label="Freeze config", open=False) as freeze_tab: - with gr.Row(): - num_layer_trainable = gr.Slider(value=3, minimum=1, maximum=128, step=1, scale=2) - name_module_trainable = gr.Textbox(value="all", scale=3) - - input_elems.update({num_layer_trainable, name_module_trainable}) - elem_dict.update( - dict( - freeze_tab=freeze_tab, num_layer_trainable=num_layer_trainable, name_module_trainable=name_module_trainable - ) - ) - - with gr.Accordion(label="LoRA config", open=False) as lora_tab: - with gr.Row(): - lora_rank = gr.Slider(value=8, minimum=1, maximum=1024, step=1, scale=1) - lora_alpha = gr.Slider(value=16, minimum=1, maximum=2048, step=1, scale=1) - lora_dropout = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01, scale=1) - lora_target = gr.Textbox(scale=2) - - with gr.Row(): - use_rslora = gr.Checkbox(scale=1) - use_dora = gr.Checkbox(scale=1) - create_new_adapter = gr.Checkbox(scale=1) - additional_target = gr.Textbox(scale=2) - - input_elems.update( - {lora_rank, lora_alpha, lora_dropout, lora_target, use_rslora, use_dora, create_new_adapter, additional_target} - ) - elem_dict.update( - dict( - lora_tab=lora_tab, - lora_rank=lora_rank, - lora_alpha=lora_alpha, - lora_dropout=lora_dropout, - lora_target=lora_target, - use_rslora=use_rslora, - use_dora=use_dora, - create_new_adapter=create_new_adapter, - additional_target=additional_target, - ) - ) - - with gr.Accordion(label="RLHF config", open=False) as rlhf_tab: - with gr.Row(): - dpo_beta = gr.Slider(value=0.1, minimum=0, maximum=1, step=0.01, scale=1) - dpo_ftx = gr.Slider(value=0, minimum=0, maximum=10, step=0.01, scale=1) - reward_model = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=2) - - training_stage.change(list_dataset, [dataset_dir, training_stage], [dataset], queue=False).then( - list_adapters, - [engine.manager.get_elem_by_name("top.model_name"), engine.manager.get_elem_by_name("top.finetuning_type")], - [reward_model], - queue=False, - ).then(autoset_packing, [training_stage], [packing], queue=False) - - input_elems.update({dpo_beta, dpo_ftx, reward_model}) - elem_dict.update(dict(rlhf_tab=rlhf_tab, dpo_beta=dpo_beta, dpo_ftx=dpo_ftx, reward_model=reward_model)) - - with gr.Accordion(label="GaLore config", open=False) as galore_tab: - with gr.Row(): - use_galore = gr.Checkbox(scale=1) - galore_rank = gr.Slider(value=16, minimum=1, maximum=1024, step=1, scale=2) - galore_update_interval = gr.Slider(value=200, minimum=1, maximum=1024, step=1, scale=2) - galore_scale = gr.Slider(value=0.25, minimum=0, maximum=1, step=0.01, scale=2) - galore_target = gr.Textbox(value="mlp,attn", scale=3) - - input_elems.update({use_galore, galore_rank, galore_update_interval, galore_scale, galore_target}) - elem_dict.update( - dict( - galore_tab=galore_tab, - use_galore=use_galore, - galore_rank=galore_rank, - galore_update_interval=galore_update_interval, - galore_scale=galore_scale, - galore_target=galore_target, - ) - ) - - with gr.Row(): - cmd_preview_btn = 
gr.Button() - start_btn = gr.Button() - stop_btn = gr.Button() - - with gr.Row(): - with gr.Column(scale=3): - with gr.Row(): - output_dir = gr.Textbox() - - with gr.Row(): - resume_btn = gr.Checkbox(visible=False, interactive=False) - process_bar = gr.Slider(visible=False, interactive=False) - - with gr.Box(): - output_box = gr.Markdown() - - with gr.Column(scale=1): - loss_viewer = gr.Plot() - - input_elems.add(output_dir) - output_elems = [output_box, process_bar] - - cmd_preview_btn.click(engine.runner.preview_train, input_elems, output_elems) - start_btn.click(engine.runner.run_train, input_elems, output_elems) - stop_btn.click(engine.runner.set_abort, queue=False) - resume_btn.change(engine.runner.monitor, outputs=output_elems) - - elem_dict.update( - dict( - cmd_preview_btn=cmd_preview_btn, - start_btn=start_btn, - stop_btn=stop_btn, - output_dir=output_dir, - resume_btn=resume_btn, - process_bar=process_bar, - output_box=output_box, - loss_viewer=loss_viewer, - ) - ) - - output_box.change( - gen_plot, - [ - engine.manager.get_elem_by_name("top.model_name"), - engine.manager.get_elem_by_name("top.finetuning_type"), - output_dir, - ], - loss_viewer, - queue=False, - ) - - return elem_dict diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/engine.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/engine.py deleted file mode 100644 index fb04ca0..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/engine.py +++ /dev/null @@ -1,62 +0,0 @@ -from typing import Any, Dict, Generator - -import gradio as gr -from gradio.components import Component # cannot use TYPE_CHECKING here - -from .chatter import WebChatModel -from .common import get_model_path, list_dataset, load_config -from .locales import LOCALES -from .manager import Manager -from .runner import Runner -from .utils import get_time - - -class Engine: - def __init__(self, demo_mode: bool = False, pure_chat: bool = False) -> None: - self.demo_mode = demo_mode - self.pure_chat = pure_chat - self.manager = Manager() - self.runner = Runner(self.manager, demo_mode) - self.chatter = WebChatModel(self.manager, demo_mode, lazy_init=(not pure_chat)) - - def _form_dict(self, resume_dict: Dict[str, Dict[str, Any]]): - return {self.manager.get_elem_by_name(k): gr.update(**v) for k, v in resume_dict.items()} - - def resume(self) -> Generator[Dict[Component, Dict[str, Any]], None, None]: - user_config = load_config() if not self.demo_mode else {} - lang = user_config.get("lang", None) or "en" - - init_dict = {"top.lang": {"value": lang}, "infer.chat_box": {"visible": self.chatter.loaded}} - - if not self.pure_chat: - init_dict["train.dataset"] = {"choices": list_dataset()["choices"]} - init_dict["eval.dataset"] = {"choices": list_dataset()["choices"]} - - if user_config.get("last_model", None): - init_dict["top.model_name"] = {"value": user_config["last_model"]} - init_dict["top.model_path"] = {"value": get_model_path(user_config["last_model"])} - - yield self._form_dict(init_dict) - - if not self.pure_chat: - if self.runner.alive and not self.demo_mode: - yield {elem: gr.update(value=value) for elem, value in self.runner.running_data.items()} - if self.runner.do_train: - yield self._form_dict({"train.resume_btn": {"value": True}}) - else: - yield self._form_dict({"eval.resume_btn": {"value": True}}) - else: - yield self._form_dict( - { - "train.output_dir": {"value": "train_" + get_time()}, - "eval.output_dir": {"value": "eval_" + get_time()}, - } - ) - - def change_lang(self, lang: str) -> Dict[Component, Dict[str, 
Any]]: - return { - component: gr.update(**LOCALES[name][lang]) - for elems in self.manager.all_elems.values() - for name, component in elems.items() - if name in LOCALES - } diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/interface.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/interface.py deleted file mode 100644 index a1f4d53..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/interface.py +++ /dev/null @@ -1,74 +0,0 @@ -import gradio as gr -from transformers.utils.versions import require_version - -from .common import save_config -from .components import ( - create_chat_box, - create_eval_tab, - create_export_tab, - create_infer_tab, - create_top, - create_train_tab, -) -from .css import CSS -from .engine import Engine - - -require_version("gradio>=3.38.0,<4.0.0", 'To fix: pip install "gradio>=3.38.0,<4.0.0"') - - -def create_ui(demo_mode: bool = False) -> gr.Blocks: - engine = Engine(demo_mode=demo_mode, pure_chat=False) - - with gr.Blocks(title="LLaMA Board", css=CSS) as demo: - if demo_mode: - gr.HTML("
LLaMA Board: A One-stop Web UI for Getting Started with LLaMA Factory") - gr.HTML( - 'Visit ' - "LLaMA Factory for details.
" - ) - gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button") - - lang, engine.manager.all_elems["top"] = create_top() - - with gr.Tab("Train"): - engine.manager.all_elems["train"] = create_train_tab(engine) - - with gr.Tab("Evaluate & Predict"): - engine.manager.all_elems["eval"] = create_eval_tab(engine) - - with gr.Tab("Chat"): - engine.manager.all_elems["infer"] = create_infer_tab(engine) - - if not demo_mode: - with gr.Tab("Export"): - engine.manager.all_elems["export"] = create_export_tab(engine) - - demo.load(engine.resume, outputs=engine.manager.list_elems()) - lang.change(engine.change_lang, [lang], engine.manager.list_elems(), queue=False) - lang.input(save_config, inputs=[lang], queue=False) - - return demo - - -def create_web_demo() -> gr.Blocks: - engine = Engine(pure_chat=True) - - with gr.Blocks(title="Web Demo", css=CSS) as demo: - lang = gr.Dropdown(choices=["en", "zh"]) - engine.manager.all_elems["top"] = dict(lang=lang) - - chat_box, _, _, chat_elems = create_chat_box(engine, visible=True) - engine.manager.all_elems["infer"] = dict(chat_box=chat_box, **chat_elems) - - demo.load(engine.resume, outputs=engine.manager.list_elems()) - lang.change(engine.change_lang, [lang], engine.manager.list_elems(), queue=False) - lang.input(save_config, inputs=[lang], queue=False) - - return demo - - -if __name__ == "__main__": - demo = create_ui() - demo.queue() - demo.launch(server_name="0.0.0.0", share=False, inbrowser=True) diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/manager.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/manager.py deleted file mode 100644 index 51ddf49..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/manager.py +++ /dev/null @@ -1,33 +0,0 @@ -from typing import TYPE_CHECKING, Dict, List, Set - - -if TYPE_CHECKING: - from gradio.components import Component - - -class Manager: - def __init__(self) -> None: - self.all_elems: Dict[str, Dict[str, "Component"]] = {} - - def get_elem_by_name(self, name: str) -> "Component": - r""" - Example: top.lang, train.dataset - """ - tab_name, elem_name = name.split(".") - return self.all_elems[tab_name][elem_name] - - def get_base_elems(self) -> Set["Component"]: - return { - self.all_elems["top"]["lang"], - self.all_elems["top"]["model_name"], - self.all_elems["top"]["model_path"], - self.all_elems["top"]["adapter_path"], - self.all_elems["top"]["finetuning_type"], - self.all_elems["top"]["quantization_bit"], - self.all_elems["top"]["template"], - self.all_elems["top"]["rope_scaling"], - self.all_elems["top"]["booster"], - } - - def list_elems(self) -> List["Component"]: - return [elem for elems in self.all_elems.values() for elem in elems.values()] diff --git a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/runner.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/runner.py deleted file mode 100644 index 1d5396a..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/runner.py +++ /dev/null @@ -1,306 +0,0 @@ -import logging -import os -import time -from threading import Thread -from typing import TYPE_CHECKING, Any, Dict, Generator, Tuple - -import gradio as gr -import transformers -from gradio.components import Component # cannot use TYPE_CHECKING here -from transformers.trainer import TRAINING_ARGS_NAME -from transformers.utils import is_torch_cuda_available - -from ..extras.callbacks import LogCallback -from ..extras.constants import TRAINING_STAGES -from ..extras.logging import LoggerHandler -from ..extras.misc import 
get_device_count, torch_gc -from ..train import run_exp -from .common import get_module, get_save_dir, load_config -from .locales import ALERTS -from .utils import gen_cmd, get_eval_results, update_process_bar - - -if TYPE_CHECKING: - from .manager import Manager - - -class Runner: - def __init__(self, manager: "Manager", demo_mode: bool = False) -> None: - self.manager = manager - self.demo_mode = demo_mode - """ Resume """ - self.thread: "Thread" = None - self.do_train = True - self.running_data: Dict["Component", Any] = None - """ State """ - self.aborted = False - self.running = False - """ Handler """ - self.logger_handler = LoggerHandler() - self.logger_handler.setLevel(logging.INFO) - logging.root.addHandler(self.logger_handler) - transformers.logging.add_handler(self.logger_handler) - - @property - def alive(self) -> bool: - return self.thread is not None - - def set_abort(self) -> None: - self.aborted = True - - def _initialize(self, data: Dict[Component, Any], do_train: bool, from_preview: bool) -> str: - get = lambda name: data[self.manager.get_elem_by_name(name)] - lang, model_name, model_path = get("top.lang"), get("top.model_name"), get("top.model_path") - dataset = get("train.dataset") if do_train else get("eval.dataset") - stage = TRAINING_STAGES[get("train.training_stage")] - reward_model = get("train.reward_model") - - if self.running: - return ALERTS["err_conflict"][lang] - - if not model_name: - return ALERTS["err_no_model"][lang] - - if not model_path: - return ALERTS["err_no_path"][lang] - - if len(dataset) == 0: - return ALERTS["err_no_dataset"][lang] - - if stage == "ppo" and not reward_model: - return ALERTS["err_no_reward_model"][lang] - - if not from_preview and self.demo_mode: - return ALERTS["err_demo"][lang] - - if not from_preview and get_device_count() > 1: - return ALERTS["err_device_count"][lang] - - if not from_preview and not is_torch_cuda_available(): - gr.Warning(ALERTS["warn_no_cuda"][lang]) - - self.aborted = False - self.logger_handler.reset() - self.trainer_callback = LogCallback(self) - return "" - - def _finalize(self, lang: str, finish_info: str) -> str: - self.thread = None - self.running_data = None - self.running = False - torch_gc() - if self.aborted: - return ALERTS["info_aborted"][lang] - else: - return finish_info - - def _parse_train_args(self, data: Dict[Component, Any]) -> Dict[str, Any]: - get = lambda name: data[self.manager.get_elem_by_name(name)] - user_config = load_config() - - if get("top.adapter_path"): - adapter_name_or_path = ",".join( - [ - get_save_dir(get("top.model_name"), get("top.finetuning_type"), adapter) - for adapter in get("top.adapter_path") - ] - ) - else: - adapter_name_or_path = None - - args = dict( - stage=TRAINING_STAGES[get("train.training_stage")], - do_train=True, - model_name_or_path=get("top.model_path"), - adapter_name_or_path=adapter_name_or_path, - cache_dir=user_config.get("cache_dir", None), - finetuning_type=get("top.finetuning_type"), - quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, - template=get("top.template"), - rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None, - flash_attn=(get("top.booster") == "flashattn"), - use_unsloth=(get("top.booster") == "unsloth"), - dataset_dir=get("train.dataset_dir"), - dataset=",".join(get("train.dataset")), - cutoff_len=get("train.cutoff_len"), - learning_rate=float(get("train.learning_rate")), - num_train_epochs=float(get("train.num_train_epochs")), - 
max_samples=int(get("train.max_samples")), - per_device_train_batch_size=get("train.batch_size"), - gradient_accumulation_steps=get("train.gradient_accumulation_steps"), - lr_scheduler_type=get("train.lr_scheduler_type"), - max_grad_norm=float(get("train.max_grad_norm")), - logging_steps=get("train.logging_steps"), - save_steps=get("train.save_steps"), - warmup_steps=get("train.warmup_steps"), - neftune_noise_alpha=get("train.neftune_alpha") or None, - optim=get("train.optim"), - resize_vocab=get("train.resize_vocab"), - packing=get("train.packing"), - upcast_layernorm=get("train.upcast_layernorm"), - use_llama_pro=get("train.use_llama_pro"), - shift_attn=get("train.shift_attn"), - use_galore=get("train.use_galore"), - output_dir=get_save_dir(get("top.model_name"), get("top.finetuning_type"), get("train.output_dir")), - fp16=(get("train.compute_type") == "fp16"), - bf16=(get("train.compute_type") == "bf16"), - pure_bf16=(get("train.compute_type") == "pure_bf16"), - ) - args["disable_tqdm"] = True - - if args["finetuning_type"] == "freeze": - args["num_layer_trainable"] = int(get("train.num_layer_trainable")) - args["name_module_trainable"] = get("train.name_module_trainable") - elif args["finetuning_type"] == "lora": - args["lora_rank"] = int(get("train.lora_rank")) - args["lora_alpha"] = int(get("train.lora_alpha")) - args["lora_dropout"] = float(get("train.lora_dropout")) - args["lora_target"] = get("train.lora_target") or get_module(get("top.model_name")) - args["use_rslora"] = get("train.use_rslora") - args["use_dora"] = get("train.use_dora") - args["additional_target"] = get("train.additional_target") or None - if args["stage"] in ["rm", "ppo", "dpo"]: - args["create_new_adapter"] = args["quantization_bit"] is None - else: - args["create_new_adapter"] = get("train.create_new_adapter") - - if args["use_llama_pro"]: - args["num_layer_trainable"] = int(get("train.num_layer_trainable")) - - if args["stage"] == "ppo": - args["reward_model"] = ",".join( - [ - get_save_dir(get("top.model_name"), get("top.finetuning_type"), adapter) - for adapter in get("train.reward_model") - ] - ) - args["reward_model_type"] = "lora" if args["finetuning_type"] == "lora" else "full" - - if args["stage"] == "dpo": - args["dpo_beta"] = get("train.dpo_beta") - args["dpo_ftx"] = get("train.dpo_ftx") - - if get("train.val_size") > 1e-6 and args["stage"] != "ppo": - args["val_size"] = get("train.val_size") - args["evaluation_strategy"] = "steps" - args["eval_steps"] = args["save_steps"] - args["per_device_eval_batch_size"] = args["per_device_train_batch_size"] - args["load_best_model_at_end"] = args["stage"] not in ["rm", "ppo"] - - if args["use_galore"]: - args["galore_rank"] = get("train.galore_rank") - args["galore_update_interval"] = get("train.galore_update_interval") - args["galore_scale"] = get("train.galore_scale") - args["galore_target"] = get("train.galore_target") - - return args - - def _parse_eval_args(self, data: Dict[Component, Any]) -> Dict[str, Any]: - get = lambda name: data[self.manager.get_elem_by_name(name)] - user_config = load_config() - - if get("top.adapter_path"): - adapter_name_or_path = ",".join( - [ - get_save_dir(get("top.model_name"), get("top.finetuning_type"), adapter) - for adapter in get("top.adapter_path") - ] - ) - else: - adapter_name_or_path = None - - args = dict( - stage="sft", - model_name_or_path=get("top.model_path"), - adapter_name_or_path=adapter_name_or_path, - cache_dir=user_config.get("cache_dir", None), - finetuning_type=get("top.finetuning_type"), - 
quantization_bit=int(get("top.quantization_bit")) if get("top.quantization_bit") in ["8", "4"] else None, - template=get("top.template"), - rope_scaling=get("top.rope_scaling") if get("top.rope_scaling") in ["linear", "dynamic"] else None, - flash_attn=(get("top.booster") == "flashattn"), - use_unsloth=(get("top.booster") == "unsloth"), - dataset_dir=get("eval.dataset_dir"), - dataset=",".join(get("eval.dataset")), - cutoff_len=get("eval.cutoff_len"), - max_samples=int(get("eval.max_samples")), - per_device_eval_batch_size=get("eval.batch_size"), - predict_with_generate=True, - max_new_tokens=get("eval.max_new_tokens"), - top_p=get("eval.top_p"), - temperature=get("eval.temperature"), - output_dir=get_save_dir(get("top.model_name"), get("top.finetuning_type"), get("eval.output_dir")), - ) - - if get("eval.predict"): - args["do_predict"] = True - else: - args["do_eval"] = True - - return args - - def _preview( - self, data: Dict[Component, Any], do_train: bool - ) -> Generator[Tuple[str, Dict[str, Any]], None, None]: - error = self._initialize(data, do_train, from_preview=True) - if error: - gr.Warning(error) - yield error, gr.update(visible=False) - else: - args = self._parse_train_args(data) if do_train else self._parse_eval_args(data) - yield gen_cmd(args), gr.update(visible=False) - - def _launch(self, data: Dict[Component, Any], do_train: bool) -> Generator[Tuple[str, Dict[str, Any]], None, None]: - error = self._initialize(data, do_train, from_preview=False) - if error: - gr.Warning(error) - yield error, gr.update(visible=False) - else: - args = self._parse_train_args(data) if do_train else self._parse_eval_args(data) - run_kwargs = dict(args=args, callbacks=[self.trainer_callback]) - self.do_train, self.running_data = do_train, data - self.thread = Thread(target=run_exp, kwargs=run_kwargs) - self.thread.start() - yield from self.monitor() - - def preview_train(self, data: Dict[Component, Any]) -> Generator[Tuple[str, Dict[str, Any]], None, None]: - yield from self._preview(data, do_train=True) - - def preview_eval(self, data: Dict[Component, Any]) -> Generator[Tuple[str, Dict[str, Any]], None, None]: - yield from self._preview(data, do_train=False) - - def run_train(self, data: Dict[Component, Any]) -> Generator[Tuple[str, Dict[str, Any]], None, None]: - yield from self._launch(data, do_train=True) - - def run_eval(self, data: Dict[Component, Any]) -> Generator[Tuple[str, Dict[str, Any]], None, None]: - yield from self._launch(data, do_train=False) - - def monitor(self) -> Generator[Tuple[str, Dict[str, Any]], None, None]: - get = lambda name: self.running_data[self.manager.get_elem_by_name(name)] - self.running = True - lang = get("top.lang") - output_dir = get_save_dir( - get("top.model_name"), - get("top.finetuning_type"), - get("{}.output_dir".format("train" if self.do_train else "eval")), - ) - - while self.thread.is_alive(): - time.sleep(2) - if self.aborted: - yield ALERTS["info_aborting"][lang], gr.update(visible=False) - else: - yield self.logger_handler.log, update_process_bar(self.trainer_callback) - - if self.do_train: - if os.path.exists(os.path.join(output_dir, TRAINING_ARGS_NAME)): - finish_info = ALERTS["info_finished"][lang] - else: - finish_info = ALERTS["err_failed"][lang] - else: - if os.path.exists(os.path.join(output_dir, "all_results.json")): - finish_info = get_eval_results(os.path.join(output_dir, "all_results.json")) - else: - finish_info = ALERTS["err_failed"][lang] - - yield self._finalize(lang, finish_info), gr.update(visible=False) diff --git 
a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/utils.py b/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/utils.py deleted file mode 100644 index 05cdd7f..0000000 --- a/src/AntSK.LLamaFactory/llamafactory/llmtuner/webui/utils.py +++ /dev/null @@ -1,104 +0,0 @@ -import json -import os -from datetime import datetime -from typing import TYPE_CHECKING, Any, Dict - -import gradio as gr - -from ..extras.packages import is_matplotlib_available -from ..extras.ploting import smooth -from .common import get_save_dir -from .locales import ALERTS - - -if TYPE_CHECKING: - from ..extras.callbacks import LogCallback - -if is_matplotlib_available(): - import matplotlib.figure - import matplotlib.pyplot as plt - - -def update_process_bar(callback: "LogCallback") -> Dict[str, Any]: - if not callback.max_steps: - return gr.update(visible=False) - - percentage = round(100 * callback.cur_steps / callback.max_steps, 0) if callback.max_steps != 0 else 100.0 - label = "Running {:d}/{:d}: {} < {}".format( - callback.cur_steps, callback.max_steps, callback.elapsed_time, callback.remaining_time - ) - return gr.update(label=label, value=percentage, visible=True) - - -def get_time() -> str: - return datetime.now().strftime("%Y-%m-%d-%H-%M-%S") - - -def can_quantize(finetuning_type: str) -> Dict[str, Any]: - if finetuning_type != "lora": - return gr.update(value="None", interactive=False) - else: - return gr.update(interactive=True) - - -def check_json_schema(text: str, lang: str) -> None: - try: - tools = json.loads(text) - if tools: - assert isinstance(tools, list) - for tool in tools: - if "name" not in tool: - raise ValueError("Name not found.") - except ValueError: - gr.Warning(ALERTS["err_tool_name"][lang]) - except Exception: - gr.Warning(ALERTS["err_json_schema"][lang]) - - -def gen_cmd(args: Dict[str, Any]) -> str: - args.pop("disable_tqdm", None) - args["plot_loss"] = args.get("do_train", None) - current_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "0") - cmd_lines = ["CUDA_VISIBLE_DEVICES={} python src/train_bash.py ".format(current_devices)] - for k, v in args.items(): - if v is not None and v is not False and v != "": - cmd_lines.append(" --{} {} ".format(k, str(v))) - cmd_text = "\\\n".join(cmd_lines) - cmd_text = "```bash\n{}\n```".format(cmd_text) - return cmd_text - - -def get_eval_results(path: os.PathLike) -> str: - with open(path, "r", encoding="utf-8") as f: - result = json.dumps(json.load(f), indent=4) - return "```json\n{}\n```\n".format(result) - - -def gen_plot(base_model: str, finetuning_type: str, output_dir: str) -> "matplotlib.figure.Figure": - if not base_model: - return - log_file = get_save_dir(base_model, finetuning_type, output_dir, "trainer_log.jsonl") - if not os.path.isfile(log_file): - return - - plt.close("all") - plt.switch_backend("agg") - fig = plt.figure() - ax = fig.add_subplot(111) - steps, losses = [], [] - with open(log_file, "r", encoding="utf-8") as f: - for line in f: - log_info = json.loads(line) - if log_info.get("loss", None): - steps.append(log_info["current_steps"]) - losses.append(log_info["loss"]) - - if len(losses) == 0: - return None - - ax.plot(steps, losses, color="#1f77b4", alpha=0.4, label="original") - ax.plot(steps, smooth(losses), color="#1f77b4", label="smoothed") - ax.legend() - ax.set_xlabel("step") - ax.set_ylabel("loss") - return fig diff --git a/src/AntSK.LLamaFactory/llamafactory/train.py b/src/AntSK.LLamaFactory/llamafactory/train.py new file mode 100644 index 0000000..b20aa9d --- /dev/null +++ 
b/src/AntSK.LLamaFactory/llamafactory/train.py @@ -0,0 +1,14 @@ +from llamafactory.train.tuner import run_exp + + +def main(): + run_exp() + + +def _mp_fn(index): + # For xla_spawn (TPUs) + run_exp() + + +if __name__ == "__main__": + main() diff --git a/src/AntSK.LLamaFactory/llamafactory/webui.py b/src/AntSK.LLamaFactory/llamafactory/webui.py new file mode 100644 index 0000000..bbefb54 --- /dev/null +++ b/src/AntSK.LLamaFactory/llamafactory/webui.py @@ -0,0 +1,13 @@ +import os + +from llamafactory.webui.interface import create_ui + + +def main(): + gradio_share = os.environ.get("GRADIO_SHARE", "0").lower() in ["true", "1"] + server_name = os.environ.get("GRADIO_SERVER_NAME", "0.0.0.0") + create_ui().queue().launch(share=gradio_share, server_name=server_name, inbrowser=True) + + +if __name__ == "__main__": + main() diff --git a/src/AntSK.LLamaFactory/modelList.json b/src/AntSK.LLamaFactory/modelList.json index 7408937..52f36e3 100644 --- a/src/AntSK.LLamaFactory/modelList.json +++ b/src/AntSK.LLamaFactory/modelList.json @@ -1,21 +1,16 @@ [ { "models": { - "Baichuan-7B-Base": { - "DEFAULT": "baichuan-inc/Baichuan-7B", - "MODELSCOPE": "baichuan-inc/baichuan-7B" + "Aya-23-8B-Chat": { + "DEFAULT": "CohereForAI/aya-23-8B", + "MODELSCOPE": "CohereForAI/aya-23-8B" }, - "Baichuan-13B-Base": { - "DEFAULT": "baichuan-inc/Baichuan-13B-Base", - "MODELSCOPE": "baichuan-inc/Baichuan-13B-Base" - }, - "Baichuan-13B-Chat": { - "DEFAULT": "baichuan-inc/Baichuan-13B-Chat", - "MODELSCOPE": "baichuan-inc/Baichuan-13B-Chat" + "Aya-23-35B-Chat": { + "DEFAULT": "CohereForAI/aya-23-35B", + "MODELSCOPE": "CohereForAI/aya-23-35B" } }, - "module": "W_pack", - "template": "baichuan" + "template": "aya" }, { "models": { @@ -36,7 +31,6 @@ "MODELSCOPE": "baichuan-inc/Baichuan2-13B-Chat" } }, - "module": "W_pack", "template": "baichuan2" }, { @@ -53,8 +47,7 @@ "DEFAULT": "bigscience/bloom-7b1", "MODELSCOPE": "AI-ModelScope/bloom-7b1" } - }, - "module": "query_key_value" + } }, { "models": { @@ -70,8 +63,7 @@ "DEFAULT": "bigscience/bloomz-7b1-mt", "MODELSCOPE": "AI-ModelScope/bloomz-7b1-mt" } - }, - "module": "query_key_value" + } }, { "models": { @@ -86,6 +78,17 @@ }, "template": "bluelm" }, + { + "models": { + "Breeze-7B": { + "DEFAULT": "MediaTek-Research/Breeze-7B-Base-v1_0" + }, + "Breeze-7B-Chat": { + "DEFAULT": "MediaTek-Research/Breeze-7B-Instruct-v1_0" + } + }, + "template": "breeze" + }, { "models": { "ChatGLM2-6B-Chat": { @@ -93,7 +96,6 @@ "MODELSCOPE": "ZhipuAI/chatglm2-6b" } }, - "module": "query_key_value", "template": "chatglm2" }, { @@ -107,7 +109,6 @@ "MODELSCOPE": "ZhipuAI/chatglm3-6b" } }, - "module": "query_key_value", "template": "chatglm3" }, { @@ -139,6 +140,69 @@ }, "template": "llama2_zh" }, + { + "models": { + "CodeGemma-7B": { + "DEFAULT": "google/codegemma-7b", + "MODELSCOPE": "google/codegemma-7b" + }, + "CodeGemma-7B-Chat": { + "DEFAULT": "google/codegemma-7b-it", + "MODELSCOPE": "AI-ModelScope/codegemma-7b-it" + }, + "CodeGemma-1.1-2B": { + "DEFAULT": "google/codegemma-1.1-2b", + "MODELSCOPE": "google/codegemma-1.1-2b" + + }, + "CodeGemma-1.1-7B-Chat": { + "DEFAULT": "google/codegemma-1.1-7b-it", + "MODELSCOPE": "google/codegemma-1.1-7b-it" + } + }, + "template": "gemma" + }, + { + "models": { + "Codestral-22B-v0.1-Chat": { + "DEFAULT": "mistralai/Codestral-22B-v0.1" + } + }, + "template": "mistral" + }, + { + "models": { + "CommandR-35B-Chat": { + "DEFAULT": "CohereForAI/c4ai-command-r-v01", + "MODELSCOPE": "AI-ModelScope/c4ai-command-r-v01" + }, + "CommandR-Plus-104B-Chat": { + "DEFAULT": 
"CohereForAI/c4ai-command-r-plus", + "MODELSCOPE": "AI-ModelScope/c4ai-command-r-plus" + }, + "CommandR-35B-4bit-Chat": { + "DEFAULT": "CohereForAI/c4ai-command-r-v01-4bit", + "MODELSCOPE": "mirror013/c4ai-command-r-v01-4bit" + }, + "CommandR-Plus-104B-4bit-Chat": { + "DEFAULT": "CohereForAI/c4ai-command-r-plus-4bit" + } + }, + "template": "cohere" + }, + { + "models": { + "DBRX-132B-Base": { + "DEFAULT": "databricks/dbrx-base", + "MODELSCOPE": "AI-ModelScope/dbrx-base" + }, + "DBRX-132B-Chat": { + "DEFAULT": "databricks/dbrx-instruct", + "MODELSCOPE": "AI-ModelScope/dbrx-instruct" + } + }, + "template": "dbrx" + }, { "models": { "DeepSeek-LLM-7B-Base": { @@ -157,15 +221,37 @@ "DEFAULT": "deepseek-ai/deepseek-llm-67b-chat", "MODELSCOPE": "deepseek-ai/deepseek-llm-67b-chat" }, - "DeepSeek-Math-7B-Base": { "DEFAULT": "deepseek-ai/deepseek-math-7b-base" }, - "DeepSeek-Math-7B-Chat": { "DEFAULT": "deepseek-ai/deepseek-math-7b-instruct" }, + "DeepSeek-Math-7B-Base": { + "DEFAULT": "deepseek-ai/deepseek-math-7b-base", + "MODELSCOPE": "deepseek-ai/deepseek-math-7b-base" + }, + "DeepSeek-Math-7B-Chat": { + "DEFAULT": "deepseek-ai/deepseek-math-7b-instruct", + "MODELSCOPE": "deepseek-ai/deepseek-math-7b-instruct" + }, "DeepSeek-MoE-16B-Base": { "DEFAULT": "deepseek-ai/deepseek-moe-16b-base", "MODELSCOPE": "deepseek-ai/deepseek-moe-16b-base" }, + "DeepSeek-MoE-16B-v2-Base": { + "DEFAULT": "deepseek-ai/DeepSeek-V2-Lite", + "MODELSCOPE": "deepseek-ai/DeepSeek-V2-Lite" + }, + "DeepSeek-MoE-236B-Base": { + "DEFAULT": "deepseek-ai/DeepSeek-V2", + "MODELSCOPE": "deepseek-ai/DeepSeek-V2" + }, "DeepSeek-MoE-16B-Chat": { "DEFAULT": "deepseek-ai/deepseek-moe-16b-chat", "MODELSCOPE": "deepseek-ai/deepseek-moe-16b-chat" + }, + "DeepSeek-MoE-16B-v2-Chat": { + "DEFAULT": "deepseek-ai/DeepSeek-V2-Lite-Chat", + "MODELSCOPE": "deepseek-ai/DeepSeek-V2-Lite-Chat" + }, + "DeepSeek-MoE-236B-Chat": { + "DEFAULT": "deepseek-ai/DeepSeek-V2-Chat", + "MODELSCOPE": "deepseek-ai/DeepSeek-V2-Chat" } }, "template": "deepseek" @@ -176,7 +262,9 @@ "DEFAULT": "deepseek-ai/deepseek-coder-6.7b-base", "MODELSCOPE": "deepseek-ai/deepseek-coder-6.7b-base" }, - "DeepSeekCoder-7B-Base": { "DEFAULT": "deepseek-ai/deepseek-coder-7b-base-v1.5" }, + "DeepSeekCoder-7B-Base": { + "DEFAULT": "deepseek-ai/deepseek-coder-7b-base-v1.5" + }, "DeepSeekCoder-33B-Base": { "DEFAULT": "deepseek-ai/deepseek-coder-33b-base", "MODELSCOPE": "deepseek-ai/deepseek-coder-33b-base" @@ -185,7 +273,9 @@ "DEFAULT": "deepseek-ai/deepseek-coder-6.7b-instruct", "MODELSCOPE": "deepseek-ai/deepseek-coder-6.7b-instruct" }, - "DeepSeekCoder-7B-Chat": { "DEFAULT": "deepseek-ai/deepseek-coder-7b-instruct-v1.5" }, + "DeepSeekCoder-7B-Chat": { + "DEFAULT": "deepseek-ai/deepseek-coder-7b-instruct-v1.5" + }, "DeepSeekCoder-33B-Chat": { "DEFAULT": "deepseek-ai/deepseek-coder-33b-instruct", "MODELSCOPE": "deepseek-ai/deepseek-coder-33b-instruct" @@ -199,6 +289,9 @@ "DEFAULT": "tiiuae/falcon-7b", "MODELSCOPE": "AI-ModelScope/falcon-7b" }, + "Falcon-11B": { + "DEFAULT": "tiiuae/falcon-11B" + }, "Falcon-40B": { "DEFAULT": "tiiuae/falcon-40b", "MODELSCOPE": "AI-ModelScope/falcon-40b" @@ -220,7 +313,6 @@ "MODELSCOPE": "modelscope/falcon-180B-chat" } }, - "module": "query_key_value", "template": "falcon" }, { @@ -240,30 +332,32 @@ "Gemma-7B-Chat": { "DEFAULT": "google/gemma-7b-it", "MODELSCOPE": "AI-ModelScope/gemma-7b-it" + }, + "Gemma-1.1-2B-Chat": { + "DEFAULT": "google/gemma-1.1-2b-it" + }, + "Gemma-1.1-7B-Chat": { + "DEFAULT": "google/gemma-1.1-7b-it" } }, "template": "gemma" }, { 
"models": { - "InternLM-7B": { - "DEFAULT": "internlm/internlm-7b", - "MODELSCOPE": "Shanghai_AI_Laboratory/internlm-7b" + "GLM-4-9B": { + "DEFAULT": "THUDM/glm-4-9b", + "MODELSCOPE": "ZhipuAI/glm-4-9b" }, - "InternLM-20B": { - "DEFAULT": "internlm/internlm-20b", - "MODELSCOPE": "Shanghai_AI_Laboratory/internlm-20b" + "GLM-4-9B-Chat": { + "DEFAULT": "THUDM/glm-4-9b-chat", + "MODELSCOPE": "ZhipuAI/glm-4-9b-chat" }, - "InternLM-7B-Chat": { - "DEFAULT": "internlm/internlm-chat-7b", - "MODELSCOPE": "Shanghai_AI_Laboratory/internlm-chat-7b" - }, - "InternLM-20B-Chat": { - "DEFAULT": "internlm/internlm-chat-20b", - "MODELSCOPE": "Shanghai_AI_Laboratory/internlm-chat-20b" + "GLM-4-9B-1M-Chat": { + "DEFAULT": "THUDM/glm-4-9b-chat-1m", + "MODELSCOPE": "ZhipuAI/glm-4-9b-chat-1m" } }, - "template": "intern" + "template": "glm4" }, { "models": { @@ -284,9 +378,17 @@ "MODELSCOPE": "Shanghai_AI_Laboratory/internlm2-chat-20b" } }, - "module": "wqkv", "template": "intern2" }, + { + "models": { + "Jambda-v0.1": { + "DEFAULT": "ai21labs/Jamba-v0.1", + "MODELSCOPE": "AI-ModelScope/Jamba-v0.1" + } + }, + "template": "Jamba" + }, { "models": { "LingoWhale-8B": { @@ -294,7 +396,7 @@ "MODELSCOPE": "DeepLang/LingoWhale-8B" } }, - "module": "qkv_proj" + "template": "LingoWhale" }, { "models": { @@ -314,7 +416,8 @@ "DEFAULT": "huggyllama/llama-65b", "MODELSCOPE": "skyline2006/llama-65b" } - } + }, + "template": "llama" }, { "models": { @@ -347,55 +450,113 @@ }, { "models": { - "Mistral-7B": { + "LLaMA3-8B": { + "DEFAULT": "meta-llama/Meta-Llama-3-8B", + "MODELSCOPE": "LLM-Research/Meta-Llama-3-8B" + }, + "LLaMA3-70B": { + "DEFAULT": "meta-llama/Meta-Llama-3-70B", + "MODELSCOPE": "LLM-Research/Meta-Llama-3-70B" + }, + "LLaMA3-8B-Chat": { + "DEFAULT": "meta-llama/Meta-Llama-3-8B-Instruct", + "MODELSCOPE": "LLM-Research/Meta-Llama-3-8B-Instruct" + }, + "LLaMA3-70B-Chat": { + "DEFAULT": "meta-llama/Meta-Llama-3-70B-Instruct", + "MODELSCOPE": "LLM-Research/Meta-Llama-3-70B-Instruct" + }, + "LLaMA3-8B-Chinese-Chat": { + "DEFAULT": "shenzhi-wang/Llama3-8B-Chinese-Chat", + "MODELSCOPE": "LLM-Research/Llama3-8B-Chinese-Chat" + }, + "LLaMA3-70B-Chinese-Chat": { + "DEFAULT": "shenzhi-wang/Llama3-70B-Chinese-Chat" + } + }, + "template": "llama3" + }, + { + "models": { + "Mistral-7B-v0.1": { "DEFAULT": "mistralai/Mistral-7B-v0.1", "MODELSCOPE": "AI-ModelScope/Mistral-7B-v0.1" }, - "Mistral-7B-Chat": { + "Mistral-7B-v0.1-Chat": { "DEFAULT": "mistralai/Mistral-7B-Instruct-v0.1", "MODELSCOPE": "AI-ModelScope/Mistral-7B-Instruct-v0.1" }, + "Mistral-7B-v0.2": { + "DEFAULT": "alpindale/Mistral-7B-v0.2-hf", + "MODELSCOPE": "AI-ModelScope/Mistral-7B-v0.2-hf" + }, "Mistral-7B-v0.2-Chat": { "DEFAULT": "mistralai/Mistral-7B-Instruct-v0.2", "MODELSCOPE": "AI-ModelScope/Mistral-7B-Instruct-v0.2" + }, + "Mistral-7B-v0.3": { + "DEFAULT": "mistralai/Mistral-7B-v0.3" + }, + "Mistral-7B-v0.3-Chat": { + "DEFAULT": "mistralai/Mistral-7B-Instruct-v0.3" } }, "template": "mistral" }, { "models": { - "Mixtral-8x7B": { + "Mixtral-8x7B-v0.1": { "DEFAULT": "mistralai/Mixtral-8x7B-v0.1", "MODELSCOPE": "AI-ModelScope/Mixtral-8x7B-v0.1" }, - "Mixtral-8x7B-Chat": { + "Mixtral-8x7B-v0.1-Chat": { "DEFAULT": "mistralai/Mixtral-8x7B-Instruct-v0.1", "MODELSCOPE": "AI-ModelScope/Mixtral-8x7B-Instruct-v0.1" + }, + "Mixtral-8x22B-v0.1": { + "DEFAULT": "mistralai/Mixtral-8x22B-v0.1", + "MODELSCOPE": "AI-ModelScope/Mixtral-8x22B-v0.1" + }, + "Mixtral-8x22B-v0.1-Chat": { + "DEFAULT": "mistralai/Mixtral-8x22B-Instruct-v0.1", + "MODELSCOPE": 
"AI-ModelScope/Mixtral-8x22B-Instruct-v0.1" } }, "template": "mistral" }, { "models": { - "OLMo-1B": { "DEFAULT": "allenai/OLMo-1B" }, - "OLMo-7B": { - "DEFAULT": "allenai/OLMo-7B", - "MODELSCOPE": "AI-ModelScope/OLMo-7B" + "OLMo-1B": { + "DEFAULT": "allenai/OLMo-1B-hf" }, - "OLMo-7B-Chat": { "DEFAULT": "allenai/OLMo-7B-Instruct" } - }, - "module": "att_proj", - "template": "olmo" + "OLMo-7B": { + "DEFAULT": "allenai/OLMo-7B-hf" + }, + "OLMo-7B-Chat": { + "DEFAULT": "ssec-uw/OLMo-7B-Instruct-hf" + }, + "OLMo-1.7-7B": { + "DEFAULT": "allenai/OLMo-1.7-7B-hf" + } + } }, { "models": { "OpenChat3.5-7B-Chat": { "DEFAULT": "openchat/openchat-3.5-0106", - "MODELSCOPE": "myxiongmodel/openchat_3.5" + "MODELSCOPE": "xcwzxcwz/openchat-3.5-0106" } }, "template": "openchat" }, + { + "models": { + "OpenChat3.6-8B-Chat": { + "DEFAULT": "openchat/openchat-3.6-8b-20240522" + } + }, + "template": "openchat-3.6" + }, { "models": { "Orion-14B-Base": { @@ -433,6 +594,35 @@ } } }, + { + "models": { + "Phi3-4B-4k-Chat": { + "DEFAULT": "microsoft/Phi-3-mini-4k-instruct", + "MODELSCOPE": "LLM-Research/Phi-3-mini-4k-instruct" + }, + "Phi3-4B-128k-Chat": { + "DEFAULT": "microsoft/Phi-3-mini-128k-instruct", + "MODELSCOPE": "LLM-Research/Phi-3-mini-128k-instruct" + }, + "Phi3-7B-8k-Chat": { + "DEFAULT": "microsoft/Phi-3-small-8k-instruct", + "MODELSCOPE": "LLM-Research/Phi-3-small-8k-instruct" + }, + "Phi3-7B-128k-Chat": { + "DEFAULT": "microsoft/Phi-3-small-128k-instruct", + "MODELSCOPE": "LLM-Research/Phi-3-small-128k-instruct" + }, + "Phi3-14B-8k-Chat": { + "DEFAULT": "microsoft/Phi-3-medium-4k-instruct", + "MODELSCOPE": "LLM-Research/Phi-3-medium-4k-instruct" + }, + "Phi3-14B-128k-Chat": { + "DEFAULT": "microsoft/Phi-3-medium-128k-instruct", + "MODELSCOPE": "LLM-Research/Phi-3-medium-128k-instruct" + } + }, + "template": "phi" + }, { "models": { "Qwen-1.8B": { @@ -500,7 +690,6 @@ "MODELSCOPE": "qwen/Qwen-72B-Chat-Int4" } }, - "module": "c_attn", "template": "qwen" }, { @@ -525,10 +714,26 @@ "DEFAULT": "Qwen/Qwen1.5-14B", "MODELSCOPE": "qwen/Qwen1.5-14B" }, + "Qwen1.5-32B": { + "DEFAULT": "Qwen/Qwen1.5-32B", + "MODELSCOPE": "qwen/Qwen1.5-32B" + }, "Qwen1.5-72B": { "DEFAULT": "Qwen/Qwen1.5-72B", "MODELSCOPE": "qwen/Qwen1.5-72B" }, + "Qwen1.5-110B": { + "DEFAULT": "Qwen/Qwen1.5-110B", + "MODELSCOPE": "qwen/Qwen1.5-110B" + }, + "Qwen1.5-MoE-A2.7B": { + "DEFAULT": "Qwen/Qwen1.5-MoE-A2.7B", + "MODELSCOPE": "qwen/Qwen1.5-MoE-A2.7B" + }, + "Qwen1.5-Code-7B": { + "DEFAULT": "Qwen/CodeQwen1.5-7B", + "MODELSCOPE": "qwen/CodeQwen1.5-7B" + }, "Qwen1.5-0.5B-Chat": { "DEFAULT": "Qwen/Qwen1.5-0.5B-Chat", "MODELSCOPE": "qwen/Qwen1.5-0.5B-Chat" @@ -549,10 +754,26 @@ "DEFAULT": "Qwen/Qwen1.5-14B-Chat", "MODELSCOPE": "qwen/Qwen1.5-14B-Chat" }, + "Qwen1.5-32B-Chat": { + "DEFAULT": "Qwen/Qwen1.5-32B-Chat", + "MODELSCOPE": "qwen/Qwen1.5-32B-Chat" + }, "Qwen1.5-72B-Chat": { "DEFAULT": "Qwen/Qwen1.5-72B-Chat", "MODELSCOPE": "qwen/Qwen1.5-72B-Chat" }, + "Qwen1.5-110B-Chat": { + "DEFAULT": "Qwen/Qwen1.5-110B-Chat", + "MODELSCOPE": "qwen/Qwen1.5-110B-Chat" + }, + "Qwen1.5-MoE-A2.7B-Chat": { + "DEFAULT": "Qwen/Qwen1.5-MoE-A2.7B-Chat", + "MODELSCOPE": "qwen/Qwen1.5-MoE-A2.7B-Chat" + }, + "Qwen1.5-Code-7B-Chat": { + "DEFAULT": "Qwen/CodeQwen1.5-7B-Chat", + "MODELSCOPE": "qwen/CodeQwen1.5-7B-Chat" + }, "Qwen1.5-0.5B-int8-Chat": { "DEFAULT": "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8", "MODELSCOPE": "qwen/Qwen1.5-0.5B-Chat-GPTQ-Int8" @@ -593,6 +814,10 @@ "DEFAULT": "Qwen/Qwen1.5-14B-Chat-AWQ", "MODELSCOPE": "qwen/Qwen1.5-14B-Chat-AWQ" }, + 
"Qwen1.5-32B-int4-Chat": { + "DEFAULT": "Qwen/Qwen1.5-32B-Chat-AWQ", + "MODELSCOPE": "qwen/Qwen1.5-32B-Chat-AWQ" + }, "Qwen1.5-72B-int8-Chat": { "DEFAULT": "Qwen/Qwen1.5-72B-Chat-GPTQ-Int8", "MODELSCOPE": "qwen/Qwen1.5-72B-Chat-GPTQ-Int8" @@ -600,13 +825,108 @@ "Qwen1.5-72B-int4-Chat": { "DEFAULT": "Qwen/Qwen1.5-72B-Chat-AWQ", "MODELSCOPE": "qwen/Qwen1.5-72B-Chat-AWQ" + }, + "Qwen1.5-110B-int4-Chat": { + "DEFAULT": "Qwen/Qwen1.5-110B-Chat-AWQ", + "MODELSCOPE": "qwen/Qwen1.5-110B-Chat-AWQ" + }, + "Qwen1.5-MoE-A2.7B-int4-Chat": { + "DEFAULT": "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4", + "MODELSCOPE": "qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4" + }, + "Qwen1.5-Code-7B-int4-Chat": { + "DEFAULT": "Qwen/CodeQwen1.5-7B-Chat-AWQ", + "MODELSCOPE": "qwen/CodeQwen1.5-7B-Chat-AWQ" } }, "template": "qwen" }, { "models": { - "SOLAR-10.7B": { "DEFAULT": "upstage/SOLAR-10.7B-v1.0" }, + "Qwen2-0.5B": { + "DEFAULT": "Qwen/Qwen2-0.5B", + "MODELSCOPE": "qwen/Qwen2-0.5B" + }, + "Qwen2-1.5B": { + "DEFAULT": "Qwen/Qwen2-1.5B", + "MODELSCOPE": "qwen/Qwen2-1.5B" + }, + "Qwen2-7B": { + "DEFAULT": "Qwen/Qwen2-7B", + "MODELSCOPE": "qwen/Qwen2-7B" + }, + "Qwen2-72B": { + "DEFAULT": "Qwen/Qwen2-72B", + "MODELSCOPE": "qwen/Qwen2-72B" + }, + "Qwen2-MoE-57B": { + "DEFAULT": "Qwen/Qwen2-57B-A14B", + "MODELSCOPE": "qwen/Qwen2-57B-A14B" + }, + "Qwen2-0.5B-Chat": { + "DEFAULT": "Qwen/Qwen2-0.5B-Instruct", + "MODELSCOPE": "qwen/Qwen2-0.5B-Instruct" + }, + "Qwen2-1.5B-Chat": { + "DEFAULT": "Qwen/Qwen2-1.5B-Instruct", + "MODELSCOPE": "qwen/Qwen2-1.5B-Instruct" + }, + "Qwen2-7B-Chat": { + "DEFAULT": "Qwen/Qwen2-7B-Instruct", + "MODELSCOPE": "qwen/Qwen2-7B-Instruct" + }, + "Qwen2-72B-Chat": { + "DEFAULT": "Qwen/Qwen2-72B-Instruct", + "MODELSCOPE": "qwen/Qwen2-72B-Instruct" + }, + "Qwen2-MoE-57B-Chat": { + "DEFAULT": "Qwen/Qwen2-57B-A14B-Instruct", + "MODELSCOPE": "qwen/Qwen2-57B-A14B-Instruct" + }, + "Qwen2-0.5B-int8-Chat": { + "DEFAULT": "Qwen/Qwen2-0.5B-Instruct-GPTQ-Int8", + "MODELSCOPE": "qwen/Qwen2-0.5B-Instruct-GPTQ-Int8" + }, + "Qwen2-0.5B-int4-Chat": { + "DEFAULT": "Qwen/Qwen2-0.5B-Instruct-AWQ", + "MODELSCOPE": "qwen/Qwen2-0.5B-Instruct-AWQ" + }, + "Qwen2-1.5B-int8-Chat": { + "DEFAULT": "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int8", + "MODELSCOPE": "qwen/Qwen2-1.5B-Instruct-GPTQ-Int8" + }, + "Qwen2-1.5B-int4-Chat": { + "DEFAULT": "Qwen/Qwen2-1.5B-Instruct-AWQ", + "MODELSCOPE": "qwen/Qwen2-1.5B-Instruct-AWQ" + }, + "Qwen2-7B-int8-Chat": { + "DEFAULT": "Qwen/Qwen2-7B-Instruct-GPTQ-Int8", + "MODELSCOPE": "qwen/Qwen2-7B-Instruct-GPTQ-Int8" + }, + "Qwen2-7B-int4-Chat": { + "DEFAULT": "Qwen/Qwen2-7B-Instruct-AWQ", + "MODELSCOPE": "qwen/Qwen2-7B-Instruct-AWQ" + }, + "Qwen2-72B-int8-Chat": { + "DEFAULT": "Qwen/Qwen2-72B-Instruct-GPTQ-Int8", + "MODELSCOPE": "qwen/Qwen2-72B-Instruct-GPTQ-Int8" + }, + "Qwen2-72B-int4-Chat": { + "DEFAULT": "Qwen/Qwen2-72B-Instruct-AWQ", + "MODELSCOPE": "qwen/Qwen2-72B-Instruct-AWQ" + }, + "Qwen2-MoE-57B-int4-Chat": { + "DEFAULT": "Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4", + "MODELSCOPE": "qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4" + } + }, + "template": "qwen" + }, + { + "models": { + "SOLAR-10.7B": { + "DEFAULT": "upstage/SOLAR-10.7B-v1.0" + }, "SOLAR-10.7B-Chat": { "DEFAULT": "upstage/SOLAR-10.7B-Instruct-v1.0", "MODELSCOPE": "AI-ModelScope/SOLAR-10.7B-Instruct-v1.0" @@ -624,11 +944,37 @@ }, { "models": { - "StarCoder2-3B": { "DEFAULT": "bigcode/starcoder2-3b" }, - "StarCoder2-7B": { "DEFAULT": "bigcode/starcoder2-7b" }, - "StarCoder2-15B": { "DEFAULT": "bigcode/starcoder2-15b" } + "StarCoder2-3B": { + "DEFAULT": 
"bigcode/starcoder2-3b", + "MODELSCOPE": "AI-ModelScope/starcoder2-3b" + }, + "StarCoder2-7B": { + "DEFAULT": "bigcode/starcoder2-7b", + "MODELSCOPE": "AI-ModelScope/starcoder2-7b" + }, + "StarCoder2-15B": { + "DEFAULT": "bigcode/starcoder2-15b", + "MODELSCOPE": "AI-ModelScope/starcoder2-15b" + } } }, + { + "models": { + "TeleChat-7B-Chat": { + "DEFAULT": "Tele-AI/telechat-7B", + "MODELSCOPE": "TeleAI/telechat-7B" + }, + "TeleChat-12B-Chat": { + "DEFAULT": "Tele-AI/TeleChat-12B", + "MODELSCOPE": "TeleAI/TeleChat-12B" + }, + "TeleChat-12B-v2-Chat": { + "DEFAULT": "Tele-AI/TeleChat-12B-v2", + "MODELSCOPE": "TeleAI/TeleChat-12B-v2" + } + }, + "template": "telechat" + }, { "models": { "Vicuna1.5-7B-Chat": { @@ -644,10 +990,54 @@ }, { "models": { - "XuanYuan-70B": { "DEFAULT": "Duxiaoman-DI/XuanYuan-70B" }, - "XuanYuan-70B-Chat": { "DEFAULT": "Duxiaoman-DI/XuanYuan-70B-Chat" }, - "XuanYuan-70B-int8-Chat": { "DEFAULT": "Duxiaoman-DI/XuanYuan-70B-Chat-8bit" }, - "XuanYuan-70B-int4-Chat": { "DEFAULT": "Duxiaoman-DI/XuanYuan-70B-Chat-4bit" } + "XuanYuan-6B": { + "DEFAULT": "Duxiaoman-DI/XuanYuan-6B", + "MODELSCOPE": "Duxiaoman-DI/XuanYuan-6B" + }, + "XuanYuan-70B": { + "DEFAULT": "Duxiaoman-DI/XuanYuan-70B", + "MODELSCOPE": "Duxiaoman-DI/XuanYuan-70B" + }, + "XuanYuan-2-70B": { + "DEFAULT": "Duxiaoman-DI/XuanYuan2-70B", + "MODELSCOPE": "Duxiaoman-DI/XuanYuan2-70B" + }, + "XuanYuan-6B-Chat": { + "DEFAULT": "Duxiaoman-DI/XuanYuan-6B-Chat", + "MODELSCOPE": "Duxiaoman-DI/XuanYuan-6B-Chat" + }, + "XuanYuan-70B-Chat": { + "DEFAULT": "Duxiaoman-DI/XuanYuan-70B-Chat", + "MODELSCOPE": "Duxiaoman-DI/XuanYuan-70B-Chat" + }, + "XuanYuan-2-70B-Chat": { + "DEFAULT": "Duxiaoman-DI/XuanYuan2-70B-Chat", + "MODELSCOPE": "Duxiaoman-DI/XuanYuan2-70B-Chat" + }, + "XuanYuan-6B-int8-Chat": { + "DEFAULT": "Duxiaoman-DI/XuanYuan-6B-Chat-8bit", + "MODELSCOPE": "Duxiaoman-DI/XuanYuan-6B-Chat-8bit" + }, + "XuanYuan-6B-int4-Chat": { + "DEFAULT": "Duxiaoman-DI/XuanYuan-6B-Chat-4bit", + "MODELSCOPE": "Duxiaoman-DI/XuanYuan-6B-Chat-4bit" + }, + "XuanYuan-70B-int8-Chat": { + "DEFAULT": "Duxiaoman-DI/XuanYuan-70B-Chat-8bit", + "MODELSCOPE": "Duxiaoman-DI/XuanYuan-70B-Chat-8bit" + }, + "XuanYuan-70B-int4-Chat": { + "DEFAULT": "Duxiaoman-DI/XuanYuan-70B-Chat-4bit", + "MODELSCOPE": "Duxiaoman-DI/XuanYuan-70B-Chat-4bit" + }, + "XuanYuan-2-70B-int8-Chat": { + "DEFAULT": "Duxiaoman-DI/XuanYuan2-70B-Chat-8bit", + "MODELSCOPE": "Duxiaoman-DI/XuanYuan2-70B-Chat-8bit" + }, + "XuanYuan-2-70B-int4-Chat": { + "DEFAULT": "Duxiaoman-DI/XuanYuan2-70B-Chat-4bit", + "MODELSCOPE": "Duxiaoman-DI/XuanYuan2-70B-Chat-4bit" + } }, "template": "xuanyuan" }, @@ -680,6 +1070,30 @@ "XVERSE-65B-Chat": { "DEFAULT": "xverse/XVERSE-65B-Chat", "MODELSCOPE": "xverse/XVERSE-65B-Chat" + }, + "XVERSE-MoE-A4.2B": { + "DEFAULT": "xverse/XVERSE-MoE-A4.2B", + "MODELSCOPE": "xverse/XVERSE-MoE-A4.2B" + }, + "XVERSE-7B-int8-Chat": { + "DEFAULT": "xverse/XVERSE-7B-Chat-GPTQ-Int8", + "MODELSCOPE": "xverse/XVERSE-7B-Chat-GPTQ-Int8" + }, + "XVERSE-7B-int4-Chat": { + "DEFAULT": "xverse/XVERSE-7B-Chat-GPTQ-Int4", + "MODELSCOPE": "xverse/XVERSE-7B-Chat-GPTQ-Int4" + }, + "XVERSE-13B-int8-Chat": { + "DEFAULT": "xverse/XVERSE-13B-Chat-GPTQ-Int8", + "MODELSCOPE": "xverse/XVERSE-13B-Chat-GPTQ-Int8" + }, + "XVERSE-13B-int4-Chat": { + "DEFAULT": "xverse/XVERSE-13B-Chat-GPTQ-Int4", + "MODELSCOPE": "xverse/XVERSE-13B-Chat-GPTQ-Int4" + }, + "XVERSE-65B-int4-Chat": { + "DEFAULT": "xverse/XVERSE-65B-Chat-GPTQ-Int4", + "MODELSCOPE": "xverse/XVERSE-65B-Chat-GPTQ-Int4" } }, "template": "xverse" 
@@ -734,6 +1148,30 @@ "Yi-34B-int4-Chat": { "DEFAULT": "01-ai/Yi-34B-Chat-4bits", "MODELSCOPE": "01ai/Yi-34B-Chat-4bits" + }, + "Yi-1.5-6B": { + "DEFAULT": "01-ai/Yi-1.5-6B", + "MODELSCOPE": "01ai/Yi-1.5-6B" + }, + "Yi-1.5-9B": { + "DEFAULT": "01-ai/Yi-1.5-9B", + "MODELSCOPE": "01ai/Yi-1.5-9B" + }, + "Yi-1.5-34B": { + "DEFAULT": "01-ai/Yi-1.5-34B", + "MODELSCOPE": "01ai/Yi-1.5-34B" + }, + "Yi-1.5-6B-Chat": { + "DEFAULT": "01-ai/Yi-1.5-6B-Chat", + "MODELSCOPE": "01ai/Yi-1.5-6B-Chat" + }, + "Yi-1.5-9B-Chat": { + "DEFAULT": "01-ai/Yi-1.5-9B-Chat", + "MODELSCOPE": "01ai/Yi-1.5-9B-Chat" + }, + "Yi-1.5-34B-Chat": { + "DEFAULT": "01-ai/Yi-1.5-34B-Chat", + "MODELSCOPE": "01ai/Yi-1.5-34B-Chat" } }, "template": "yi" @@ -764,21 +1202,11 @@ "Zephyr-7B-Beta-Chat": { "DEFAULT": "HuggingFaceH4/zephyr-7b-beta", "MODELSCOPE": "modelscope/zephyr-7b-beta" + }, + "Zephyr-141B-ORPO-Chat": { + "DEFAULT": "HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1" } }, "template": "zephyr" - }, - { - "models": { - "Atom-7B": { - "DEFAULT": "FlagAlpha/Atom-7B", - "MODELSCOPE": "FlagAlpha/Atom-7B" - }, - "Atom-7B-Chat": { - "DEFAULT": "FlagAlpha/Atom-7B-Chat", - "MODELSCOPE": "FlagAlpha/Atom-7B-Chat" - } - }, - "template": "atom" } ] \ No newline at end of file diff --git a/src/AntSK.LLamaFactory/requirements.txt b/src/AntSK.LLamaFactory/requirements.txt index 700dd79..25f1cf6 100644 --- a/src/AntSK.LLamaFactory/requirements.txt +++ b/src/AntSK.LLamaFactory/requirements.txt @@ -1,10 +1,9 @@ torch>=1.13.1 --index-url https://download.pytorch.org/whl/cu121 -transformers>=4.37.2 -datasets>=2.14.3 -accelerate>=0.27.2 -peft>=0.9.0 -trl>=0.7.11 -gradio>=3.38.0,<4.0.0 +transformers>=4.41.2 +datasets>=2.16.0 +accelerate>=0.30.1 +peft>=0.11.1 +gradio>=4.0.0 scipy einops sentencepiece @@ -13,9 +12,12 @@ uvicorn pydantic fastapi sse-starlette -matplotlib +matplotlib>=3.7.0 fire modelscope langchain-community sentence_transformers FlagEmbedding +packaging +pyyaml +trl>=0.8.6 diff --git a/src/AntSK/Pages/Setting/AIModel/AddModel.razor.cs b/src/AntSK/Pages/Setting/AIModel/AddModel.razor.cs index 42705f7..05af016 100644 --- a/src/AntSK/Pages/Setting/AIModel/AddModel.razor.cs +++ b/src/AntSK/Pages/Setting/AIModel/AddModel.razor.cs @@ -249,6 +249,7 @@ namespace AntSK.Pages.Setting.AIModel _logModalVisible = true; llamaFactoryDic.Value = "true"; _IDics_Repositories.Update(llamaFactoryDic); + _ILLamaFactoryService.LogMessageReceived -= CmdLogHandler; _ILLamaFactoryService.LogMessageReceived += CmdLogHandler; _ILLamaFactoryService.StartLLamaFactory(_aiModel.ModelName, "default"); } @@ -268,6 +269,7 @@ namespace AntSK.Pages.Setting.AIModel if (result == ConfirmResult.Yes) { _logModalVisible = true; + _ILLamaFactoryService.LogMessageReceived -= CmdLogHandler; _ILLamaFactoryService.LogMessageReceived += CmdLogHandler; _ILLamaFactoryService.PipInstall(); }
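
The modelList.json revision above drops the per-group "module" field and leaves each entry with a "models" map (display name to download sources, where DEFAULT is typically a Hugging Face repo id and MODELSCOPE a ModelScope id) plus an optional chat "template". A minimal C# sketch of how such a file could be read follows; the type and method names are illustrative assumptions, not AntSK's actual classes.

// Hypothetical sketch: one way to deserialize the revised modelList.json.
// "models" maps a display name to its download sources; "template" is
// optional and absent for base models without a chat template.
using System.Collections.Generic;
using System.Text.Json;
using System.Text.Json.Serialization;

public sealed class ModelGroup
{
    [JsonPropertyName("models")]
    public Dictionary<string, Dictionary<string, string>> Models { get; set; } = new();

    [JsonPropertyName("template")]
    public string? Template { get; set; }
}

public static class ModelListLoader
{
    // Deserialize returns null only for a literal JSON "null" document;
    // fall back to an empty list in that case.
    public static List<ModelGroup> Load(string json) =>
        JsonSerializer.Deserialize<List<ModelGroup>>(json) ?? new List<ModelGroup>();
}

For example, looking up "Qwen2-7B-Chat" in the group whose template is "qwen" yields "Qwen/Qwen2-7B-Instruct" under DEFAULT and "qwen/Qwen2-7B-Instruct" under MODELSCOPE, matching the entries added in this patch.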
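
The AddModel.razor.cs change detaches CmdLogHandler before attaching it again, so reopening the dialog or clicking the action repeatedly leaves exactly one subscription and each log line is rendered once. A minimal, self-contained illustration of that idempotent-subscription pattern (not AntSK's actual types):

using System;

// Removing a handler that is not currently attached is a no-op in C#,
// so "-=" followed by "+=" always leaves exactly one registration.
public class LogSource
{
    public event Action<string>? LogMessageReceived;
    public void Emit(string line) => LogMessageReceived?.Invoke(line);
}

public class LogConsumer
{
    private readonly LogSource _source;
    public LogConsumer(LogSource source) => _source = source;

    public void Attach()
    {
        _source.LogMessageReceived -= OnLog;   // safe even if OnLog was never subscribed
        _source.LogMessageReceived += OnLog;   // subscribed exactly once from here on
    }

    private void OnLog(string line) => Console.WriteLine(line);
}

Calling Attach() any number of times before LogSource.Emit still runs the handler once per emitted line, which is why the razor page clears the handler before re-registering it.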