api_server.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367
  1. # Adapted from openai/api_server.py and tgi-kai-bridge
  2. import argparse
  3. import asyncio
  4. import json
  5. import os
  6. from http import HTTPStatus
  7. from typing import AsyncGenerator, List, Tuple
  8. import fastapi
  9. import uvicorn
  10. from fastapi import APIRouter, Request, Response
  11. from fastapi.middleware.cors import CORSMiddleware
  12. from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
  13. from loguru import logger
  14. from prometheus_client import make_asgi_app
  15. from aphrodite.common.outputs import RequestOutput
  16. from aphrodite.common.sampling_params import _SAMPLING_EPS, SamplingParams
  17. from aphrodite.common.utils import random_uuid
  18. from aphrodite.endpoints.kobold.protocol import KAIGenerationInputSchema
  19. from aphrodite.engine.args_tools import AsyncEngineArgs
  20. from aphrodite.engine.async_aphrodite import AsyncAphrodite
  21. from aphrodite.transformers_utils.tokenizer import get_tokenizer
  22. TIMEOUT_KEEP_ALIVE = 5 # seconds
  23. served_model: str = "Read Only"
  24. engine: AsyncAphrodite = None
  25. gen_cache: dict = {}
  26. app = fastapi.FastAPI()
  27. badwordsids: List[int] = []
  28. # Add prometheus asgi middleware to route /metrics/ requests
  29. metrics_app = make_asgi_app()
  30. app.mount("/metrics/", metrics_app)
  31. def _set_badwords(tokenizer, hf_config): # pylint: disable=redefined-outer-name
  32. global badwordsids
  33. if hf_config.bad_words_ids is not None:
  34. badwordsids = hf_config.bad_words_ids
  35. return
  36. badwordsids = [
  37. v for k, v in tokenizer.get_vocab().items()
  38. if any(c in str(k) for c in "[]")
  39. ]
  40. if tokenizer.pad_token_id in badwordsids:
  41. badwordsids.remove(tokenizer.pad_token_id)
  42. badwordsids.append(tokenizer.eos_token_id)
  43. kai_api = APIRouter()
  44. extra_api = APIRouter()
  45. kobold_lite_ui = ""
  46. app.add_middleware(
  47. CORSMiddleware,
  48. allow_origins=["*"],
  49. allow_credentials=True,
  50. allow_methods=["*"],
  51. allow_headers=["*"],
  52. )
  53. def create_error_response(status_code: HTTPStatus,
  54. message: str) -> JSONResponse:
  55. return JSONResponse({
  56. "msg": message,
  57. "type": "invalid_request_error"
  58. },
  59. status_code=status_code.value)
  60. @app.exception_handler(ValueError)
  61. def validation_exception_handler(request, exc): # pylint: disable=unused-argument
  62. return create_error_response(HTTPStatus.UNPROCESSABLE_ENTITY, str(exc))
  63. def prepare_engine_payload(
  64. kai_payload: KAIGenerationInputSchema
  65. ) -> Tuple[SamplingParams, List[int]]:
  66. """Create SamplingParams and truncated input tokens for AsyncEngine"""
  67. if not kai_payload.genkey:
  68. kai_payload.genkey = f"kai-{random_uuid()}"
  69. if kai_payload.max_context_length > max_model_len:
  70. raise ValueError(
  71. f"max_context_length ({kai_payload.max_context_length}) "
  72. "must be less than or equal to "
  73. f"max_model_len ({max_model_len})")
  74. # KAIspec: top_k == 0 means disabled, aphrodite: top_k == -1 means disabled
  75. # https://github.com/KoboldAI/KoboldAI-Client/wiki/Settings
  76. kai_payload.top_k = kai_payload.top_k if kai_payload.top_k != 0.0 else -1
  77. kai_payload.tfs = max(_SAMPLING_EPS, kai_payload.tfs)
  78. if kai_payload.temperature < _SAMPLING_EPS:
  79. # temp < _SAMPLING_EPS: greedy sampling
  80. kai_payload.n = 1
  81. kai_payload.top_p = 1.0
  82. kai_payload.top_k = -1
  83. if kai_payload.dynatemp_range is not None:
  84. dynatemp_min = kai_payload.temperature - kai_payload.dynatemp_range
  85. dynatemp_max = kai_payload.temperature + kai_payload.dynatemp_range
  86. sampling_params = SamplingParams(
  87. n=kai_payload.n,
  88. best_of=kai_payload.n,
  89. repetition_penalty=kai_payload.rep_pen,
  90. temperature=kai_payload.temperature,
  91. dynatemp_min=dynatemp_min if kai_payload.dynatemp_range > 0 else 0.0,
  92. dynatemp_max=dynatemp_max if kai_payload.dynatemp_range > 0 else 0.0,
  93. dynatemp_exponent=kai_payload.dynatemp_exponent,
  94. smoothing_factor=kai_payload.smoothing_factor,
  95. smoothing_curve=kai_payload.smoothing_curve,
  96. tfs=kai_payload.tfs,
  97. top_p=kai_payload.top_p,
  98. top_k=kai_payload.top_k,
  99. top_a=kai_payload.top_a,
  100. min_p=kai_payload.min_p,
  101. typical_p=kai_payload.typical,
  102. eta_cutoff=kai_payload.eta_cutoff,
  103. epsilon_cutoff=kai_payload.eps_cutoff,
  104. mirostat_mode=kai_payload.mirostat,
  105. mirostat_tau=kai_payload.mirostat_tau,
  106. mirostat_eta=kai_payload.mirostat_eta,
  107. stop=kai_payload.stop_sequence,
  108. include_stop_str_in_output=kai_payload.include_stop_str_in_output,
  109. custom_token_bans=badwordsids
  110. if kai_payload.use_default_badwordsids else [],
  111. max_tokens=kai_payload.max_length,
  112. seed=kai_payload.sampler_seed,
  113. )
  114. max_input_tokens = max(
  115. 1, kai_payload.max_context_length - kai_payload.max_length)
  116. input_tokens = tokenizer(kai_payload.prompt).input_ids[-max_input_tokens:]
  117. return sampling_params, input_tokens
  118. @kai_api.post("/generate")
  119. async def generate(kai_payload: KAIGenerationInputSchema) -> JSONResponse:
  120. """Generate text"""
  121. sampling_params, input_tokens = prepare_engine_payload(kai_payload)
  122. result_generator = engine.generate(None, sampling_params,
  123. kai_payload.genkey, input_tokens)
  124. final_res: RequestOutput = None
  125. previous_output = ""
  126. async for res in result_generator:
  127. final_res = res
  128. new_chunk = res.outputs[0].text[len(previous_output):]
  129. previous_output += new_chunk
  130. gen_cache[kai_payload.genkey] = previous_output
  131. assert final_res is not None
  132. del gen_cache[kai_payload.genkey]
  133. return JSONResponse(
  134. {"results": [{
  135. "text": output.text
  136. } for output in final_res.outputs]})
  137. @extra_api.post("/generate/stream")
  138. async def generate_stream(
  139. kai_payload: KAIGenerationInputSchema) -> StreamingResponse:
  140. """Generate text SSE streaming"""
  141. sampling_params, input_tokens = prepare_engine_payload(kai_payload)
  142. results_generator = engine.generate(None, sampling_params,
  143. kai_payload.genkey, input_tokens)
  144. async def stream_kobold() -> AsyncGenerator[bytes, None]:
  145. previous_output = ""
  146. async for res in results_generator:
  147. new_chunk = res.outputs[0].text[len(previous_output):]
  148. previous_output += new_chunk
  149. yield b"event: message\n"
  150. yield f"data: {json.dumps({'token': new_chunk})}\n\n".encode()
  151. return StreamingResponse(stream_kobold(),
  152. headers={
  153. "Cache-Control": "no-cache",
  154. "Connection": "keep-alive"
  155. },
  156. media_type="text/event-stream")
  157. @extra_api.post("/generate/check")
  158. @extra_api.get("/generate/check")
  159. async def check_generation(request: Request):
  160. """Check outputs in progress (poll streaming)"""
  161. text = ""
  162. try:
  163. request_dict = await request.json()
  164. if "genkey" in request_dict and request_dict["genkey"] in gen_cache:
  165. text = gen_cache[request_dict["genkey"]]
  166. except json.JSONDecodeError:
  167. pass
  168. return JSONResponse({"results": [{"text": text}]})
  169. @extra_api.post("/abort")
  170. async def abort_generation(request: Request):
  171. """Abort running generation"""
  172. try:
  173. request_dict = await request.json()
  174. if "genkey" in request_dict:
  175. await engine.abort(request_dict["genkey"])
  176. except json.JSONDecodeError:
  177. pass
  178. return JSONResponse({})
  179. @extra_api.post("/tokencount")
  180. async def count_tokens(request: Request):
  181. """Tokenize string and return token count"""
  182. request_dict = await request.json()
  183. tokenizer_result = tokenizer(request_dict["prompt"])
  184. return JSONResponse({"value": len(tokenizer_result.input_ids)})
  185. @kai_api.get("/info/version")
  186. async def get_version():
  187. """Impersonate KAI"""
  188. return JSONResponse({"result": "1.2.4"})
  189. @kai_api.get("/model")
  190. async def get_model():
  191. """Get current model"""
  192. return JSONResponse({"result": f"aphrodite/{served_model}"})
  193. @kai_api.get("/config/soft_prompts_list")
  194. async def get_available_softprompts():
  195. """Stub for compatibility"""
  196. return JSONResponse({"values": []})
  197. @kai_api.get("/config/soft_prompt")
  198. async def get_current_softprompt():
  199. """Stub for compatibility"""
  200. return JSONResponse({"value": ""})
  201. @kai_api.put("/config/soft_prompt")
  202. async def set_current_softprompt():
  203. """Stub for compatibility"""
  204. return JSONResponse({})
  205. @kai_api.get("/config/max_length")
  206. async def get_max_length() -> JSONResponse:
  207. """Return the configured max output length"""
  208. max_length = args.max_length
  209. return JSONResponse({"value": max_length})
  210. @kai_api.get("/config/max_context_length")
  211. @extra_api.get("/true_max_context_length")
  212. async def get_max_context_length() -> JSONResponse:
  213. """Return the max context length based on the EngineArgs configuration."""
  214. max_context_length = engine_model_config.max_model_len
  215. return JSONResponse({"value": max_context_length})
  216. @extra_api.get("/preloadstory")
  217. async def get_preloaded_story() -> JSONResponse:
  218. """Stub for compatibility"""
  219. return JSONResponse({})
  220. @extra_api.get("/version")
  221. async def get_extra_version():
  222. """Impersonate KoboldCpp"""
  223. return JSONResponse({"result": "KoboldCpp", "version": "1.55.1"})
  224. @app.get("/")
  225. async def get_kobold_lite_ui():
  226. """Serves a cached copy of the Kobold Lite UI, loading it from disk on
  227. demand if needed."""
  228. # read and return embedded kobold lite
  229. global kobold_lite_ui
  230. if kobold_lite_ui == "":
  231. scriptpath = os.path.dirname(os.path.abspath(__file__))
  232. klitepath = os.path.join(scriptpath, "klite.embd")
  233. if os.path.exists(klitepath):
  234. with open(klitepath, "r") as f:
  235. kobold_lite_ui = f.read()
  236. else:
  237. print("Embedded Kobold Lite not found")
  238. return HTMLResponse(content=kobold_lite_ui)
  239. @app.get("/health")
  240. async def health() -> Response:
  241. """Health check route for K8s"""
  242. return Response(status_code=200)
  243. app.include_router(kai_api, prefix="/api/v1")
  244. app.include_router(kai_api, prefix="/api/latest", include_in_schema=False)
  245. app.include_router(extra_api, prefix="/api/extra")
  246. if __name__ == "__main__":
  247. parser = argparse.ArgumentParser(
  248. description="Aphrodite KoboldAI-Compatible RESTful API server.")
  249. parser.add_argument("--host",
  250. type=str,
  251. default="localhost",
  252. help="host name")
  253. parser.add_argument("--port", type=int, default=2242, help="port number")
  254. parser.add_argument("--served-model-name",
  255. type=str,
  256. default=None,
  257. help="The model name used in the API. If not "
  258. "specified, the model name will be the same as "
  259. "the huggingface name.")
  260. parser.add_argument("--max-length",
  261. type=int,
  262. default=256,
  263. help="The maximum length of the generated text. "
  264. "For use with Kobold Horde.")
  265. parser = AsyncEngineArgs.add_cli_args(parser)
  266. args = parser.parse_args()
  267. logger.debug(f"args: {args}")
  268. logger.warning("The standalone Kobold API is deprecated and will not "
  269. "receive updates. Please use the OpenAI API with the "
  270. "--launch-kobold-api flag instead.")
  271. if args.served_model_name is not None:
  272. served_model = args.served_model_name
  273. else:
  274. served_model = args.model
  275. engine_args = AsyncEngineArgs.from_cli_args(args)
  276. engine = AsyncAphrodite.from_engine_args(engine_args)
  277. engine_model_config = asyncio.run(engine.get_model_config())
  278. max_model_len = engine_model_config.max_model_len
  279. # A separate tokenizer to map token IDs to strings.
  280. tokenizer = get_tokenizer(engine_args.tokenizer,
  281. tokenizer_mode=engine_args.tokenizer_mode,
  282. trust_remote_code=engine_args.trust_remote_code)
  283. _set_badwords(tokenizer, engine_model_config.hf_config)
  284. uvicorn.run(app,
  285. host=args.host,
  286. port=args.port,
  287. log_level="info",
  288. timeout_keep_alive=TIMEOUT_KEEP_ALIVE)