Update (March 8): added a Flask-based model server and tested model performance.
Download links (give it a like!)
Official code: GitHub - facebookresearch/llama: Inference code for LLaMA models
All model weights: ipfs://Qmb9y5GCkTG7ZzbBWMu2BXwMkzyCKcUjtEKPpgdZ7GEFKm
7B model: ipfs://QmbvdJ7KgvZiyaqHw5QtQxRtUd7pCAdkWWbzuvyKusLGTw
13B model: ipfs://QmPCfCEERStStjg4kfj3cmCUu1TP7pVQbxdFMwnhpuJtxk
30B model: ipfs://QmSD8cxm4zvvnD35KKFu8D9VjXAavNoGWemPW1pQ3AF9ZZ
65B model: ipfs://QmdWH379NQu8XoesA8AFw9nKV2MpGR4KohK7WyugadAKTh
Additional magnet link:
magnet:?xt=urn:btih:cdee3052d85c697b84f4c1192f43a2276c0daea0&dn=LLaMA
LLaMA can run on just 8 GB of VRAM!
Project: https://github.com/juncongmoo/pyllama
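The 8 GB figure only works out if the weights are held at reduced precision (pyllama supports quantized inference); the back-of-the-envelope arithmetic below is my own sketch, not pyllama's loader, and it ignores activations, the KV cache, and framework overhead.

# Rough VRAM needed just to store LLaMA-7B weights at different precisions.
# This is illustrative arithmetic only, not code from pyllama.
PARAMS_7B = 6.7e9  # LLaMA-7B has roughly 6.7 billion parameters

for name, bits in [("fp16", 16), ("int8", 8), ("int4", 4)]:
    gib = PARAMS_7B * bits / 8 / 1024**3
    print(f"{name}: ~{gib:.1f} GiB for the weights alone")

# fp16: ~12.5 GiB -> does not fit in 8 GB of VRAM
# int8: ~6.2 GiB  -> fits
# int4: ~3.1 GiB  -> fits comfortably

So the full fp16 checkpoint cannot fit on an 8 GB card, but 8-bit or 4-bit weights can, which is what makes the claim above plausible.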
If you want to test the model yourself, follow these steps. The full server script is below:
import argparse
from flask import Flask, request, jsonify
from waitress import serve
from typing import Tuple
import os
import sys
import torch
import time
import json
from pathlib import Path
import torch.distributed as dist
from fairscale.nn.model_parallel.initialize import initialize_model_parallel
from llama import ModelArgs, Transformer, Tokenizer, LLaMA

app = Flask(__name__)


def setup_model_parallel() -> Tuple[int, int]:
    # Read the rank/world size set by torchrun, then initialise NCCL and model parallelism.
    local_rank = int(os.environ.get("LOCAL_RANK", -1))
    world_size = int(os.environ.get("WORLD_SIZE", -1))
    print(f"local:{local_rank},world:{world_size}")

    dist.init_process_group("nccl")
    initialize_model_parallel(world_size)
    torch.cuda.set_device(local_rank)

    # seed must be the same in all processes
    torch.manual_seed(1)
    return local_rank, world_size


def load(ckpt_dir: str, tokenizer_path: str, local_rank: int, world_size: int) -> LLaMA:
    # Load this rank's checkpoint shard and build the LLaMA generator.
    start_time = time.time()
    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
    assert world_size == len(checkpoints), (
        f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}"
    )
    ckpt_path = checkpoints[local_rank]
    print("Loading")
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.loads(f.read())

    model_args: ModelArgs = ModelArgs(max_seq_len=1024, max_batch_size=8, **params)
    tokenizer = Tokenizer(model_path=tokenizer_path)
    model_args.vocab_size = tokenizer.n_words
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    model = Transformer(model_args)
    torch.set_default_tensor_type(torch.FloatTensor)
    model.load_state_dict(checkpoint, strict=False)

    generator = LLaMA(model, tokenizer)
    print(f"Loaded in {time.time() - start_time:.2f} seconds")
    return generator


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--ckpt_dir")
    parser.add_argument("--tokenizer_path", type=str)
    parser.add_argument("--port", type=int, default=7860)
    parser.add_argument("--host", default="0.0.0.0")
    args = parser.parse_args()

    local_rank, world_size = setup_model_parallel()
    # if local_rank > 0:
    #     sys.stdout = open(os.devnull, 'w')
    generator = load(args.ckpt_dir, args.tokenizer_path, local_rank, world_size)
    print(f"LLaMA web-server at host {args.host} port {args.port + local_rank}")

    # -------------------------------- WHAT I ADDED --------------------------------
    # import traceback
    # @app.errorhandler(Exception)
    # def haddle_error(e):
    #     return "Internal Server Error", 500

    @app.route("/infer", methods=["POST"])
    def generate():
        input = request.json
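Launch the script with torchrun, just like the official example.py, so that LOCAL_RANK and WORLD_SIZE are populated (for the 7B model that means one process, with --ckpt_dir pointing at the 7B folder and --tokenizer_path at tokenizer.model). Once the server is up you can hit /infer from another terminal. The client below is my own sketch: since the generate() handler is cut off above, the JSON field names (prompts, max_gen_len, temperature, top_p) are assumptions and should be renamed to match whatever your handler actually reads.

# Minimal test client for the /infer endpoint above.
# NOTE: the request schema is assumed, not taken from the (truncated) handler.
import requests

payload = {
    "prompts": ["The capital of France is"],  # assumed field name
    "max_gen_len": 64,                        # assumed field name
    "temperature": 0.8,                       # assumed field name
    "top_p": 0.95,                            # assumed field name
}

resp = requests.post("http://127.0.0.1:7860/infer", json=payload, timeout=300)
print(resp.status_code)
print(resp.text)

Generation on a single GPU can take a while for long outputs, hence the generous timeout.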