How should we view the LLaMA model leak?

Are LLMs entering the open-source era?

Update, March 8: serve the model with Flask and do a quick performance test.
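Once the webui.py service shown further down is running, you can sanity-check it from any HTTP client. The snippet below is only a rough sketch: the "prompts" field name is an assumption and must match whatever your /infer handler actually reads from request.json.

# Hypothetical client for the Flask service defined in webui.py below.
# The "prompts" key is an assumed name; adjust it to your /infer handler.
import requests

resp = requests.post(
    "http://127.0.0.1:7860/infer",
    json={"prompts": ["The capital of France is"]},
    timeout=300,
)
print(resp.status_code, resp.text)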

Download links (please give this a like!)

Official code: GitHub - facebookresearch/llama: Inference code for LLaMA models

All model weights: ipfs://Qmb9y5GCkTG7ZzbBWMu2BXwMkzyCKcUjtEKPpgdZ7GEFKm

7B model: ipfs://QmbvdJ7KgvZiyaqHw5QtQxRtUd7pCAdkWWbzuvyKusLGTw
13B model: ipfs://QmPCfCEERStStjg4kfj3cmCUu1TP7pVQbxdFMwnhpuJtxk
30B model: ipfs://QmSD8cxm4zvvnD35KKFu8D9VjXAavNoGWemPW1pQ3AF9ZZ
65B model: ipfs://QmdWH379NQu8XoesA8AFw9nKV2MpGR4KohK7WyugadAKTh

Additional magnet link:

magnet:?xt=urn:btih:cdee3052d85c697b84f4c1192f43a2276c0daea0&dn=LLaMA
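If you already run a local IPFS node, a small helper like the one below can pull a weight directory by CID. This is only a sketch: it assumes the kubo `ipfs` CLI is installed with `ipfs daemon` running, and the fetch_weights name is just for illustration.

# Hypothetical download helper: pulls one of the weight directories listed
# above through a local IPFS node ("ipfs get" is part of the kubo CLI).
import os
import subprocess

WEIGHT_CIDS = {
    "7B":  "QmbvdJ7KgvZiyaqHw5QtQxRtUd7pCAdkWWbzuvyKusLGTw",
    "13B": "QmPCfCEERStStjg4kfj3cmCUu1TP7pVQbxdFMwnhpuJtxk",
    "30B": "QmSD8cxm4zvvnD35KKFu8D9VjXAavNoGWemPW1pQ3AF9ZZ",
    "65B": "QmdWH379NQu8XoesA8AFw9nKV2MpGR4KohK7WyugadAKTh",
}

def fetch_weights(size: str, out_dir: str = ".") -> None:
    """Fetch one model size by CID and write it to out_dir/<size>."""
    cid = WEIGHT_CIDS[size]
    subprocess.run(["ipfs", "get", cid, "-o", os.path.join(out_dir, size)], check=True)

if __name__ == "__main__":
    fetch_weights("7B")  # requires a running `ipfs daemon`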

LLaMA can run on as little as 8 GB of VRAM!

Project: github.com/juncongmoo/p


Want to test the model yourself? Follow these steps:

  1. Download the official code and model weights
  2. Create a new file webui.py:
import argparse
from flask import Flask, request, jsonify
from waitress import serve
from typing import Tuple
import os
import sys
import torch
import time
import json
from pathlib import Path
import torch.distributed as dist
from fairscale.nn.model_parallel.initialize import initialize_model_parallel
from llama import ModelArgs, Transformer, Tokenizer, LLaMA
app = Flask(__name__)
def setup_model_parallel() -> Tuple[int, int]:
    # LOCAL_RANK / WORLD_SIZE are set by torchrun (or torch.distributed.launch)
    local_rank = int(os.environ.get("LOCAL_RANK", -1))
    world_size = int(os.environ.get("WORLD_SIZE", -1))
    print(f"local:{local_rank},world:{world_size}")
    dist.init_process_group("nccl")
    initialize_model_parallel(world_size)
    torch.cuda.set_device(local_rank)
    # seed must be the same in all processes
    torch.manual_seed(1)
    return local_rank, world_size
def load(ckpt_dir: str, tokenizer_path: str, local_rank: int, world_size: int) -> LLaMA:
    start_time = time.time()
    # One .pth shard per model-parallel rank; each rank loads its own shard
    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
    assert (
        world_size == len(checkpoints)
    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}"
    ckpt_path = checkpoints[local_rank]
    print("Loading")
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.loads(f.read())
    model_args: ModelArgs = ModelArgs(max_seq_len=1024, max_batch_size=8, **params)
    tokenizer = Tokenizer(model_path=tokenizer_path)
    model_args.vocab_size = tokenizer.n_words
    # Build the weights in fp16 on the GPU, then restore the default tensor type
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    model = Transformer(model_args)
    torch.set_default_tensor_type(torch.FloatTensor)
    model.load_state_dict(checkpoint, strict=False)
    generator = LLaMA(model, tokenizer)
    print(f"Loaded in {time.time() - start_time:.2f} seconds")
    return generator
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--ckpt_dir")
    parser.add_argument("--tokenizer_path", type=str)
    parser.add_argument("--port", type=int,default=7860)
    parser.add_argument("--host", default="0.0.0.0")
    args = parser.parse_args()
    local_rank, world_size = setup_model_parallel()
    # if local_rank > 0:
    #     sys.stdout = open(os.devnull, 'w')
    generator = load(args.ckpt_dir, args.tokenizer_path, local_rank, world_size)
    print(f"LLaMA web-server at host {args.host}  port {args.port+local_rank}")
    #--------------------------------------------------------WHAT I ADDED--------------------------------------------------------------
    # import traceback
    # @app.errorhandler(Exception)
    #     def handle_error(e):
    #     return "Internal Server Error", 500
    @app.route("/infer", methods=["POST"])
    def generate():
        input = request.json