Browse Source

Merge branch 'THUDM:main' into main

xingyu 1 year ago
parent
commit
73f9469b30

+ 13 - 0
README.md

@@ -87,6 +87,19 @@ python -m vllm.entrypoints.openai.api_server \
      --trust_remote_code
 ```
 
+### Rust-candle
+CodeGeeX4 now supports the Candle framework: [Repo](https://github.com/huggingface/candle/blob/main/candle-examples/examples/codegeex4-9b/README.org)
+#### Build
+Use Rust to launch [codegeex4-all-9b](https://huggingface.co/THUDM/codegeex4-all-9b):
+``` shell
+	cd candle_demo
+	cargo build --release --features cuda # for CUDA
+	cargo build --release # for CPU
+	./target/release/codegeex4-candle --prompt "please write a FFT in rust" --sample-len 512
+```
+
 ## Tutorials
 CodeGeeX4-ALL-9B provides three user guides to help users quickly understand and use the model:
 

+ 10 - 0
README_zh.md

@@ -89,6 +89,16 @@ python -m vllm.entrypoints.openai.api_server \
      --trust_remote_code
 ```
 
+### Rust-candle
+CodeGeeX4 now supports the Candle framework: [Repo](https://github.com/huggingface/candle/blob/main/candle-examples/examples/codegeex4-9b/README.org)
+#### Build
+Use Rust to launch [codegeex4-all-9b](https://huggingface.co/THUDM/codegeex4-all-9b):
+``` shell
+	cd candle_demo
+	cargo build --release --features cuda # with CUDA
+	cargo build --release # with CPU
+	./target/release/codegeex4-candle --prompt "please write a FFT in rust" --sample-len 512
+```
 ## User Guide
 We provide user guides for CodeGeeX4-ALL-9B to help users quickly understand and use the model:
 

+ 2 - 0
candle_demo/.gitignore

@@ -0,0 +1,2 @@
+target
+Cargo.lock

+ 41 - 0
candle_demo/Cargo.toml

@@ -0,0 +1,41 @@
+[package]
+name = "codegeex4-candle"
+version = "0.1.0"
+edition = "2021"
+authors = ["Donjuan Platinum <[email protected]>"]
+license = "GPL-2.0-only"
+description = "Codegeex4"
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+hf-hub = "0.3.2"
+clap = { version = "4.5.6", features = ["derive"] }
+tokenizers = "0.19.1"
+serde_json = "1.0.120"
+candle-core = "0.6.0"
+candle-transformers = "0.6.0"
+candle-examples = "0.6.0"
+candle-nn = "0.6.0"
+safetensors = "0.4.3"
+accelerate-src = { version = "0.3.2", optional = true }
+intel-mkl-src = { version = "0.8.1", features = ["mkl-static-lp64-iomp"], optional = true }
+rand = "0.8.5"
+owo-colors = "4.0.0"
+
+[build-dependencies]
+bindgen_cuda = { version = "0.1.1", optional = true }
+
+
+[features]
+default = []
+cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-transformers/cuda", "dep:bindgen_cuda"]
+accelerate = ["dep:accelerate-src", "candle-core/accelerate", "candle-nn/accelerate", "candle-transformers/accelerate"]
+mkl = ["dep:intel-mkl-src", "candle-core/mkl", "candle-nn/mkl", "candle-transformers/mkl"]

+ 97 - 0
candle_demo/README.org

@@ -0,0 +1,97 @@
+* candle-codegeex4_9b
+THUDM/CodeGeeX4 is a versatile model for all AI software development scenarios, including code completion, code interpreter, web search, function calling, repository-level Q&A and much more.
+[[file:../resources/candle_example.png]]
+
+- [[https://github.com/THUDM/CodeGeeX4][Github]]
+- [[https://codegeex.cn/][HomePage]]
+- [[https://huggingface.co/THUDM/codegeex4-all-9b][huggingface]]  
+- [[https://github.com/huggingface/candle/blob/main/candle-examples/examples/codegeex4-9b/README.org][Candle]]
+** Running with ~cuda~
+
+#+begin_src shell
+  cargo run --release --features cuda -- --prompt "please write a FFT in rust" --sample-len 300
+#+end_src
+
+** Running with ~cpu~
+#+begin_src shell
+  cargo run --release -- --cpu --prompt "please write a FFT in rust" --sample-len 300
+#+end_src
+
+** Output Example
+*** Input
+#+begin_src shell
+  cargo run --release --features cuda -- --prompt "please write a FFT in rust" --sample-len 500
+#+end_src
+
+*** Output
+#+begin_src shell
+  avx: false, neon: false, simd128: false, f16c: false
+  temp: 0.95 repeat-penalty: 1.10 repeat-last-n: 64
+  cache path /root/autodl-tmp
+  Prompt: [please write a FFT in rust]
+  Using Seed 11511762269791786684
+  DType is BF16
+  creating transformer layers
+  Model loaded in 4s
+  starting the inference loop
+
+  sample-len 500
+
+  500 tokens generated (34.60 token/s)
+  Result:
+
+  Sure, I can help you with that. Here's an example of a Fast Fourier Transform (FFT) implementation in Rust:
+
+  ```rust
+  use num_complex::Complex;
+
+  fn fft(input: &[Complex<f64>]) -> Vec<Complex<f64>> {
+      let n = input.len();
+
+      if n == 1 {
+          return vec![input[0]];
+      }
+
+      let mut even = vec![];
+      let mut odd = vec![];
+
+      for i in 0..n {
+          if i % 2 == 0 {
+              even.push(input[i]);
+          } else {
+              odd.push(input[i]);
+          }
+      }
+
+      let even_fft = fft(&even);
+      let odd_fft = fft(&odd);
+
+      let mut output = vec![];
+
+      for k in 0..n / 2 {
+          let t = Complex::new(0.0, -2.0 * std::f64::consts::PI * (k as f64) / (n as f64)).exp();
+
+          output.push(even_fft[k] + odd_fft[k] * t);
+          output.push(even_fft[k] - odd_fft[k] * t);
+      }
+
+      return output;
+  }
+  ```
+
+  This implementation uses the Cooley-Tukey algorithm to perform the FFT. The function takes an array of complex numbers and returns an array of complex numbers which is the result of the FFT.
+#+end_src
+
+
+* Citation
+#+begin_src
+  @inproceedings{zheng2023codegeex,
+  title={CodeGeeX: A Pre-Trained Model for Code Generation with Multilingual Benchmarking on HumanEval-X},
+  author={Qinkai Zheng and Xiao Xia and Xu Zou and Yuxiao Dong and Shan Wang and Yufei Xue and Zihan Wang and Lei Shen and Andi Wang and Yang Li and Teng Su and Zhilin Yang and Jie Tang},
+  booktitle={Proceedings of the 29th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},
+  pages={5673--5684},
+  year={2023}
+}
+#+end_src

+ 594 - 0
candle_demo/src/codegeex4.rs

@@ -0,0 +1,594 @@
+use candle_core as candle;
+use candle_core::{DType, Device, IndexOp, Module, Result, Tensor, D};
+use candle_nn::VarBuilder;
+use candle_transformers::models::with_tracing::{linear_b as linear, Linear};
+
+#[derive(Debug, Clone)]
+pub struct Config {
+    pub num_layers: usize,
+    pub padded_vocab_size: usize,
+    pub hidden_size: usize,
+    pub ffn_hidden_size: usize,
+    pub kv_channels: usize,
+    pub num_attention_heads: usize,
+    pub seq_length: usize,
+    pub layernorm_epsilon: f64,
+    pub rmsnorm: bool,
+    pub apply_residual_connection_post_layernorm: bool,
+    pub post_layer_norm: bool,
+    pub add_bias_linear: bool,
+    pub add_qkv_bias: bool,
+    pub bias_dropout_fusion: bool,
+    pub multi_query_attention: bool,
+    pub multi_query_group_num: usize,
+    pub apply_query_key_layer_scaling: bool,
+    pub attention_softmax_in_fp32: bool,
+    pub fp32_residual_connection: bool,
+}
+
+impl Config {
+    pub fn codegeex4() -> Self {
+        Self {
+            num_layers: 40,
+            padded_vocab_size: 151552,
+            hidden_size: 4096,
+            ffn_hidden_size: 13696,
+            kv_channels: 128,
+            num_attention_heads: 32,
+            seq_length: 131072,
+            layernorm_epsilon: 1e-5,
+            rmsnorm: true,
+            apply_residual_connection_post_layernorm: false,
+            post_layer_norm: true,
+            add_bias_linear: false,
+            add_qkv_bias: true,
+            bias_dropout_fusion: true,
+            multi_query_attention: true,
+            multi_query_group_num: 2,
+            apply_query_key_layer_scaling: true,
+            attention_softmax_in_fp32: true,
+            fp32_residual_connection: false,
+        }
+    }
+}
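+
+// A small illustrative sketch (not part of the model code): under
+// multi-query attention, the fused QKV projection width implied by
+// Config::codegeex4() is projection_size + 2 * head_dim * multi_query_group_num,
+// matching the qkv_hidden_size computed in SelfAttention::new below.
+#[cfg(test)]
+mod config_shape_sketch {
+    use super::Config;
+
+    #[test]
+    fn qkv_width() {
+        let cfg = Config::codegeex4();
+        let projection_size = cfg.kv_channels * cfg.num_attention_heads; // 128 * 32 = 4096
+        let head_dim = projection_size / cfg.num_attention_heads; // 128
+        let qkv = projection_size + 2 * head_dim * cfg.multi_query_group_num;
+        assert_eq!(qkv, 4608); // 4096 for queries + 2 * 256 for the shared keys/values
+    }
+}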
+
+#[derive(Debug, Clone)]
+struct RotaryEmbedding {
+    cache: Tensor,
+}
+
+impl RotaryEmbedding {
+    fn new(cfg: &Config, dtype: DType, dev: &Device) -> Result<Self> {
+        let rotary_dim = cfg.kv_channels;
+        let n_elem = rotary_dim / 2;
+        let inv_freq: Vec<_> = (0..n_elem)
+            .step_by(2)
+            .map(|i| 1f32 / 10_000f64.powf(i as f64 / n_elem as f64) as f32)
+            .collect();
+        let inv_freq_len = inv_freq.len();
+        let inv_freq = Tensor::from_vec(inv_freq, (1, inv_freq_len), dev)?.to_dtype(dtype)?;
+        let t = Tensor::arange(0u32, cfg.seq_length as u32, dev)?
+            .to_dtype(dtype)?
+            .reshape((cfg.seq_length, 1))?;
+        let freqs = t.matmul(&inv_freq)?;
+        let cache = Tensor::stack(&[&freqs.cos()?, &freqs.sin()?], D::Minus1)?;
+        Ok(Self { cache })
+    }
+
+    fn apply(&self, xs: &Tensor, seqlen_offset: usize) -> Result<Tensor> {
+        let (seqlen, _b, np, _hn) = xs.dims4()?;
+        let cache = self.cache.narrow(0, seqlen_offset, seqlen)?;
+        let rot_dim = cache.dim(D::Minus2)? * 2;
+        let (xs, xs_pass) = (
+            xs.narrow(D::Minus1, 0, rot_dim)?,
+            xs.narrow(D::Minus1, rot_dim, rot_dim)?,
+        );
+        let xshaped = xs.reshape((seqlen, (), np, rot_dim / 2, 2))?;
+        let cache = cache.reshape((seqlen, (), 1, rot_dim / 2, 2))?;
+        let (xshaped0, xshaped1) = (
+            xshaped.i((.., .., .., .., 0))?,
+            xshaped.i((.., .., .., .., 1))?,
+        );
+        let (cache0, cache1) = (cache.i((.., .., .., .., 0))?, cache.i((.., .., .., .., 1))?);
+        let xs_out = Tensor::stack(
+            &[
+                (xshaped0.broadcast_mul(&cache0)? - xshaped1.broadcast_mul(&cache1)?)?,
+                (xshaped1.broadcast_mul(&cache0)? + xshaped0.broadcast_mul(&cache1)?)?,
+            ],
+            D::Minus1,
+        )?;
+        let xs_out = xs_out.flatten_from(3)?;
+        Tensor::cat(&[xs_out, xs_pass], D::Minus1)
+    }
+}
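+
+// A small illustrative sketch of the interleaved rotation that
+// RotaryEmbedding::apply performs per (cos, sin) pair:
+// out0 = x0*cos - x1*sin and out1 = x1*cos + x0*sin. The real path above
+// operates on stacked tensor halves rather than scalars.
+#[cfg(test)]
+mod rotary_sketch {
+    fn rotate_pair(x0: f32, x1: f32, cos: f32, sin: f32) -> (f32, f32) {
+        (x0 * cos - x1 * sin, x1 * cos + x0 * sin)
+    }
+
+    #[test]
+    fn quarter_turn() {
+        // A quarter turn maps (1, 0) to (0, 1).
+        let theta = std::f32::consts::FRAC_PI_2;
+        let (r0, r1) = rotate_pair(1.0, 0.0, theta.cos(), theta.sin());
+        assert!(r0.abs() < 1e-6 && (r1 - 1.0).abs() < 1e-6);
+    }
+}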
+
+#[derive(Debug, Clone)]
+struct CoreAttention {
+    coeff: Option<f64>,
+    norm_factor: f64,
+    dtype: DType,
+}
+
+fn masked_fill(on_false: &Tensor, mask: &Tensor, on_true: f32, dtype: DType) -> Result<Tensor> {
+    let shape = mask.shape();
+    let on_true = Tensor::new(on_true, on_false.device())?.broadcast_as(shape.dims())?;
+    let m = mask.where_cond(&on_true.to_dtype(dtype)?, on_false)?;
+    Ok(m)
+}
+
+impl CoreAttention {
+    fn new(layer_number: usize, cfg: &Config, dtype: DType) -> Result<Self> {
+        let norm_factor = (cfg.kv_channels as f64).sqrt();
+        let (norm_factor, coeff) = if cfg.apply_query_key_layer_scaling {
+            let coeff = f64::max(1.0, layer_number as f64);
+            (norm_factor * coeff, Some(coeff))
+        } else {
+            (norm_factor, None)
+        };
+        Ok(Self { coeff, norm_factor, dtype })
+    }
+
+    fn forward(
+        &self,
+        query_layer: &Tensor,
+        key_layer: &Tensor,
+        value_layer: &Tensor,
+        attention_mask: &Option<Tensor>,
+    ) -> Result<Tensor> {
+        let output_size = (
+            query_layer.dim(1)?, // b
+            query_layer.dim(2)?, // np
+            query_layer.dim(0)?, // sq
+            key_layer.dim(0)?,   // sk
+        );
+        let query_layer =
+            query_layer.reshape((output_size.2, output_size.0 * output_size.1, ()))?;
+        let key_layer = key_layer.reshape((output_size.3, output_size.0 * output_size.1, ()))?;
+        let matmul_result = Tensor::matmul(
+            &query_layer.transpose(0, 1)?.contiguous()?,
+            &key_layer.transpose(0, 1)?.transpose(1, 2)?.contiguous()?,
+        )?;
+        let matmul_result = (matmul_result / self.norm_factor)?.reshape(output_size)?;
+        let matmul_result = match self.coeff {
+            None => matmul_result,
+            Some(coeff) => (matmul_result * coeff)?,
+        };
+        let attention_scores = match attention_mask {
+            Some(mask) => masked_fill(
+                &matmul_result,
+                &mask.broadcast_left((matmul_result.dim(0)?, matmul_result.dim(1)?))?,
+                f32::NEG_INFINITY,
+                self.dtype,
+            )?,
+            None => matmul_result,
+        };
+        let attention_probs = candle_nn::ops::softmax_last_dim(&attention_scores)?;
+
+        let output_size = (
+            value_layer.dim(1)?,
+            value_layer.dim(2)?,
+            query_layer.dim(0)?,
+            value_layer.dim(3)?,
+        );
+        let value_layer =
+            value_layer.reshape((value_layer.dim(0)?, output_size.0 * output_size.1, ()))?;
+        let attention_probs =
+            attention_probs.reshape((output_size.0 * output_size.1, output_size.2, ()))?;
+        let context_layer = Tensor::matmul(
+            &attention_probs.contiguous()?,
+            &value_layer.transpose(0, 1)?.contiguous()?,
+        )?;
+        let context_layer = context_layer.reshape(output_size)?;
+        let context_layer = context_layer.permute((2, 0, 1, 3))?.contiguous()?;
+        context_layer.flatten_from(D::Minus2)
+    }
+}
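+
+// A note on apply_query_key_layer_scaling: dividing by norm_factor (which
+// includes `coeff`) and multiplying the matmul result by `coeff` afterwards
+// cancels out, so the net scale stays 1/sqrt(kv_channels); the detour keeps
+// the intermediate scores smaller, which helps in low-precision dtypes.
+// A small illustrative check:
+#[cfg(test)]
+mod layer_scaling_sketch {
+    #[test]
+    fn net_scale_is_unchanged() {
+        let (kv_channels, layer_number) = (128f64, 3f64);
+        let coeff = f64::max(1.0, layer_number);
+        let norm_factor = kv_channels.sqrt() * coeff;
+        let raw = 42.0f64;
+        let scaled = raw / norm_factor * coeff;
+        assert!((scaled - raw / kv_channels.sqrt()).abs() < 1e-9);
+    }
+}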
+
+#[derive(Debug, Clone)]
+struct SelfAttention {
+    query_key_value: Linear,
+    core_attention: CoreAttention,
+    dense: Linear,
+    multi_query_attention: bool,
+    num_attention_heads_per_partition: usize,
+    num_multi_query_groups_per_partition: usize,
+    hidden_size_per_attention_head: usize,
+    kv_cache: Option<(Tensor, Tensor)>,
+}
+
+impl SelfAttention {
+    fn new(layer_number: usize, cfg: &Config, vb: VarBuilder) -> Result<Self> {
+        let projection_size = cfg.kv_channels * cfg.num_attention_heads;
+        let hidden_size_per_attention_head = projection_size / cfg.num_attention_heads;
+        let qkv_hidden_size = if cfg.multi_query_attention {
+            projection_size + 2 * hidden_size_per_attention_head * cfg.multi_query_group_num
+        } else {
+            3 * projection_size
+        };
+        let query_key_value = linear(
+            cfg.hidden_size,
+            qkv_hidden_size,
+            cfg.add_bias_linear || cfg.add_qkv_bias,
+            vb.pp("query_key_value"),
+        )?;
+        let core_attention = CoreAttention::new(layer_number, cfg, vb.dtype())?;
+        let dense = linear(
+            cfg.hidden_size,
+            cfg.hidden_size,
+            cfg.add_bias_linear,
+            vb.pp("dense"),
+        )?;
+        Ok(Self {
+            query_key_value,
+            core_attention,
+            dense,
+            multi_query_attention: cfg.multi_query_attention,
+            num_attention_heads_per_partition: cfg.num_attention_heads,
+            num_multi_query_groups_per_partition: cfg.multi_query_group_num,
+            hidden_size_per_attention_head: cfg.kv_channels,
+            kv_cache: None,
+        })
+    }
+
+    fn reset_kv_cache(&mut self) {
+        self.kv_cache = None
+    }
+
+    fn forward(
+        &mut self,
+        xs: &Tensor,
+        attention_mask: &Option<Tensor>,
+        rotary_emb: &RotaryEmbedding,
+    ) -> Result<Tensor> {
+        let mixed_x_layer = xs.apply(&self.query_key_value)?;
+        if !self.multi_query_attention {
+            candle::bail!("only multi_query_attention=true is supported")
+        }
+        let hpa = self.hidden_size_per_attention_head;
+        let query_layer =
+            mixed_x_layer.narrow(D::Minus1, 0, self.num_attention_heads_per_partition * hpa)?;
+        let key_layer = mixed_x_layer.narrow(
+            D::Minus1,
+            self.num_attention_heads_per_partition * hpa,
+            self.num_multi_query_groups_per_partition * hpa,
+        )?;
+        let value_layer = mixed_x_layer.narrow(
+            D::Minus1,
+            self.num_attention_heads_per_partition * hpa
+                + self.num_multi_query_groups_per_partition * hpa,
+            self.num_multi_query_groups_per_partition * hpa,
+        )?;
+        let query_layer = query_layer.reshape((
+            query_layer.dim(0)?,
+            query_layer.dim(1)?,
+            self.num_attention_heads_per_partition,
+            hpa,
+        ))?;
+        let key_layer = key_layer.reshape((
+            key_layer.dim(0)?,
+            key_layer.dim(1)?,
+            self.num_multi_query_groups_per_partition,
+            hpa,
+        ))?;
+        let value_layer = value_layer.reshape((
+            value_layer.dim(0)?,
+            value_layer.dim(1)?,
+            self.num_multi_query_groups_per_partition,
+            hpa,
+        ))?;
+
+        // Rotary embeddings.
+        let seqlen_offset = match &self.kv_cache {
+            None => 0,
+            Some((prev_k, _)) => prev_k.dim(0)?,
+        };
+        let query_layer = rotary_emb.apply(&query_layer, seqlen_offset)?;
+        let key_layer = rotary_emb.apply(&key_layer, seqlen_offset)?;
+
+        // KV cache.
+        let (key_layer, value_layer) = match &self.kv_cache {
+            None => (key_layer, value_layer),
+            Some((prev_k, prev_v)) => {
+                let k = Tensor::cat(&[prev_k, &key_layer], 0)?;
+                let v = Tensor::cat(&[prev_v, &value_layer], 0)?;
+                (k, v)
+            }
+        };
+        self.kv_cache = Some((key_layer.clone(), value_layer.clone()));
+
+        // Repeat KV.
+        let ratio =
+            self.num_attention_heads_per_partition / self.num_multi_query_groups_per_partition;
+        let key_layer = {
+            let (d0, d1, d2, d3) = key_layer.dims4()?;
+            key_layer
+                .unsqueeze(D::Minus2)?
+                .expand((d0, d1, d2, ratio, d3))?
+                .reshape((
+                    d0,
+                    d1,
+                    self.num_attention_heads_per_partition,
+                    self.hidden_size_per_attention_head,
+                ))?
+        };
+        let value_layer = {
+            let (d0, d1, d2, d3) = value_layer.dims4()?;
+            value_layer
+                .unsqueeze(D::Minus2)?
+                .expand((d0, d1, d2, ratio, d3))?
+                .reshape((
+                    d0,
+                    d1,
+                    self.num_attention_heads_per_partition,
+                    self.hidden_size_per_attention_head,
+                ))?
+        };
+
+        let context_layer =
+            self.core_attention
+                .forward(&query_layer, &key_layer, &value_layer, attention_mask)?;
+        let output = context_layer.apply(&self.dense)?;
+        Ok(output)
+    }
+}
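+
+// A small illustrative sketch of the "Repeat KV" step above: each
+// multi-query group is broadcast to cover several query heads via
+// unsqueeze/expand/reshape (the real model expands 2 groups to 32 heads,
+// a ratio of 16; tiny dimensions are used here).
+#[cfg(test)]
+mod repeat_kv_sketch {
+    use candle_core::{DType, Device, Tensor, D};
+
+    #[test]
+    fn expand_groups_to_heads() -> candle_core::Result<()> {
+        let (seq, b, groups, head_dim, ratio) = (1usize, 1usize, 2usize, 3usize, 4usize);
+        let kv = Tensor::zeros((seq, b, groups, head_dim), DType::F32, &Device::Cpu)?;
+        let repeated = kv
+            .unsqueeze(D::Minus2)?
+            .expand((seq, b, groups, ratio, head_dim))?
+            .reshape((seq, b, groups * ratio, head_dim))?;
+        assert_eq!(repeated.dims(), &[seq, b, groups * ratio, head_dim]);
+        Ok(())
+    }
+}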
+
+#[allow(clippy::upper_case_acronyms)]
+#[derive(Debug, Clone)]
+struct MLP {
+    dense_h_to_4h: Linear,
+    dense_4h_to_h: Linear,
+}
+
+impl MLP {
+    fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
+        let dense_h_to_4h = linear(
+            cfg.hidden_size,
+            cfg.ffn_hidden_size * 2,
+            cfg.add_bias_linear,
+            vb.pp("dense_h_to_4h"),
+        )?;
+        let dense_4h_to_h = linear(
+            cfg.ffn_hidden_size,
+            cfg.hidden_size,
+            cfg.add_bias_linear,
+            vb.pp("dense_4h_to_h"),
+        )?;
+        Ok(Self {
+            dense_4h_to_h,
+            dense_h_to_4h,
+        })
+    }
+}
+
+impl Module for MLP {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        xs.apply(&self.dense_h_to_4h)?
+            .apply(&candle_nn::Activation::Swiglu)?
+            .apply(&self.dense_4h_to_h)
+    }
+}
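+
+// A scalar sketch of the Swiglu activation used above (assumption: candle's
+// Activation::Swiglu splits the last dimension in half and computes
+// silu(a) * b), which is why dense_h_to_4h projects to 2 * ffn_hidden_size.
+#[cfg(test)]
+mod swiglu_sketch {
+    fn silu(x: f32) -> f32 {
+        x / (1.0 + (-x).exp())
+    }
+
+    #[test]
+    fn gate_times_value() {
+        // gate a = 1.0, value b = 2.0
+        let out = silu(1.0) * 2.0;
+        assert!((out - 1.4621).abs() < 1e-3);
+    }
+}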
+
+#[derive(Debug, Clone)]
+struct Block {
+    input_layernorm: candle_nn::LayerNorm,
+    self_attention: SelfAttention,
+    post_attention_layernorm: candle_nn::LayerNorm,
+    mlp: MLP,
+    apply_residual_connection_post_layernorm: bool,
+}
+
+impl Block {
+    fn new(layer_number: usize, cfg: &Config, vb: VarBuilder) -> Result<Self> {
+        let input_layernorm = if cfg.rmsnorm {
+            candle_nn::rms_norm(
+                cfg.hidden_size,
+                cfg.layernorm_epsilon,
+                vb.pp("input_layernorm"),
+            )?
+            .into_inner()
+        } else {
+            candle_nn::layer_norm(
+                cfg.hidden_size,
+                cfg.layernorm_epsilon,
+                vb.pp("input_layernorm"),
+            )?
+        };
+        let post_attention_layernorm = if cfg.rmsnorm {
+            candle_nn::rms_norm(
+                cfg.hidden_size,
+                cfg.layernorm_epsilon,
+                vb.pp("post_attention_layernorm"),
+            )?
+            .into_inner()
+        } else {
+            candle_nn::layer_norm(
+                cfg.hidden_size,
+                cfg.layernorm_epsilon,
+                vb.pp("post_attention_layernorm"),
+            )?
+        };
+        let self_attention = SelfAttention::new(layer_number, cfg, vb.pp("self_attention"))?;
+        let mlp = MLP::new(cfg, vb.pp("mlp"))?;
+        Ok(Self {
+            input_layernorm,
+            self_attention,
+            post_attention_layernorm,
+            mlp,
+            apply_residual_connection_post_layernorm: cfg.apply_residual_connection_post_layernorm,
+        })
+    }
+
+    fn reset_kv_cache(&mut self) {
+        self.self_attention.reset_kv_cache()
+    }
+
+    fn forward(
+        &mut self,
+        xs: &Tensor,
+        attention_mask: &Option<Tensor>,
+        rotary_emb: &RotaryEmbedding,
+    ) -> Result<Tensor> {
+        let layernorm_output = xs.apply(&self.input_layernorm)?;
+        let attention_output =
+            self.self_attention
+                .forward(&layernorm_output, attention_mask, rotary_emb)?;
+        let residual = if self.apply_residual_connection_post_layernorm {
+            &layernorm_output
+        } else {
+            xs
+        };
+        let layernorm_input = (residual + attention_output)?;
+        let layernorm_output = layernorm_input.apply(&self.post_attention_layernorm)?;
+        let mlp_output = layernorm_output.apply(&self.mlp)?;
+        let residual = if self.apply_residual_connection_post_layernorm {
+            &layernorm_output
+        } else {
+            &layernorm_input
+        };
+        mlp_output + residual
+    }
+}
+
+#[derive(Debug, Clone)]
+struct Transformer {
+    layers: Vec<Block>,
+    final_layernorm: Option<candle_nn::LayerNorm>,
+    rotary_emb: RotaryEmbedding,
+}
+
+impl Transformer {
+    fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
+        let vb_l = vb.pp("layers");
+        let mut layers = Vec::with_capacity(cfg.num_layers);
+        println!("creating transformer layers");
+        for layer_index in 0..cfg.num_layers {
+            println!("creating layer {} of {}", layer_index + 1, cfg.num_layers);
+            let block = Block::new(layer_index + 1, cfg, vb_l.pp(layer_index))?;
+            layers.push(block)
+        }
+        let final_layernorm = if cfg.post_layer_norm {
+            let ln = if cfg.rmsnorm {
+                candle_nn::rms_norm(
+                    cfg.hidden_size,
+                    cfg.layernorm_epsilon,
+                    vb.pp("final_layernorm"),
+                )?
+                .into_inner()
+            } else {
+                candle_nn::layer_norm(
+                    cfg.hidden_size,
+                    cfg.layernorm_epsilon,
+                    vb.pp("final_layernorm"),
+                )?
+            };
+            Some(ln)
+        } else {
+            None
+        };
+        let rotary_emb = RotaryEmbedding::new(cfg, vb.dtype(), vb.device())?;
+        Ok(Self {
+            layers,
+            final_layernorm,
+            rotary_emb,
+        })
+    }
+
+    fn reset_kv_cache(&mut self) {
+        for block in self.layers.iter_mut() {
+            block.reset_kv_cache()
+        }
+    }
+
+    fn forward(&mut self, xs: &Tensor, attention_mask: &Option<Tensor>) -> Result<Tensor> {
+        let mut xs = xs.clone();
+        for block in self.layers.iter_mut() {
+            xs = block.forward(&xs, attention_mask, &self.rotary_emb)?
+        }
+        match self.final_layernorm.as_ref() {
+            None => Ok(xs),
+            Some(ln) => xs.apply(ln),
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+struct Embedding {
+    word_embeddings: candle_nn::Embedding,
+    fp32_residual_connection: bool,
+}
+
+impl Embedding {
+    fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
+        let word_embeddings = candle_nn::embedding(
+            cfg.padded_vocab_size,
+            cfg.hidden_size,
+            vb.pp("word_embeddings"),
+        )?;
+        Ok(Self {
+            word_embeddings,
+            fp32_residual_connection: cfg.fp32_residual_connection,
+        })
+    }
+}
+
+impl Module for Embedding {
+    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
+        let xs = self.word_embeddings.forward(xs)?.transpose(0, 1)?; // b,s,h -> s,b,h
+        if self.fp32_residual_connection {
+            xs.to_dtype(candle::DType::F32)
+        } else {
+            xs.contiguous()
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct Model {
+    embedding: Embedding,
+    encoder: Transformer,
+    output_layer: Linear,
+}
+
+fn get_mask(size: usize, device: &Device) -> Result<Tensor> {
+    let mask: Vec<_> = (0..size)
+        .flat_map(|i| (0..size).map(move |j| u8::from(j > i)))
+        .collect();
+    Tensor::from_slice(&mask, (size, size), device)
+}
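+
+// A quick illustrative check of get_mask: entries above the diagonal
+// (j > i) are 1 and get filled with -inf in CoreAttention, so each position
+// can only attend to itself and earlier positions.
+#[cfg(test)]
+mod mask_sketch {
+    use candle_core::Device;
+
+    #[test]
+    fn upper_triangle_is_masked() -> candle_core::Result<()> {
+        let mask = super::get_mask(3, &Device::Cpu)?;
+        let rows = mask.to_vec2::<u8>()?;
+        assert_eq!(rows, vec![vec![0, 1, 1], vec![0, 0, 1], vec![0, 0, 0]]);
+        Ok(())
+    }
+}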
+
+impl Model {
+    pub fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
+        let vb = vb.pp("transformer");
+        let embedding = Embedding::new(cfg, vb.pp("embedding"))?;
+        let encoder = Transformer::new(cfg, vb.pp("encoder"))?;
+        let output_layer = linear(
+            cfg.hidden_size,
+            cfg.padded_vocab_size,
+            false,
+            vb.pp("output_layer"),
+        )?;
+
+        Ok(Self {
+            embedding,
+            encoder,
+            output_layer,
+        })
+    }
+
+    pub fn reset_kv_cache(&mut self) {
+        self.encoder.reset_kv_cache()
+    }
+
+    pub fn forward(&mut self, xs: &Tensor) -> Result<Tensor> {
+        let (_b_size, seq_len) = xs.dims2()?;
+        let input_embeds = xs.apply(&self.embedding)?;
+        let attention_mask = if seq_len <= 1 {
+            None
+        } else {
+            Some(get_mask(seq_len, xs.device())?)
+        };
+        let xs = self.encoder.forward(&input_embeds, &attention_mask)?;
+        let lm_logits = xs.i(seq_len - 1)?.apply(&self.output_layer)?;
+        Ok(lm_logits)
+    }
+}

+ 1 - 0
candle_demo/src/lib.rs

@@ -0,0 +1 @@
+pub mod codegeex4;

+ 284 - 0
candle_demo/src/main.rs

@@ -0,0 +1,284 @@
+#[cfg(feature = "mkl")]
+extern crate intel_mkl_src;
+
+#[cfg(feature = "accelerate")]
+extern crate accelerate_src;
+
+use clap::Parser;
+use codegeex4_candle::codegeex4::*;
+use owo_colors::{self, OwoColorize};
+use std::io::BufRead;
+use std::io::BufReader;
+
+use candle_core as candle;
+use candle_core::{DType, Device, Tensor};
+use candle_nn::VarBuilder;
+use candle_transformers::generation::LogitsProcessor;
+use hf_hub::{Repo, RepoType};
+use rand::Rng;
+use tokenizers::Tokenizer;
+
+struct TextGeneration {
+    model: Model,
+    device: Device,
+    tokenizer: Tokenizer,
+    logits_processor: LogitsProcessor,
+    repeat_penalty: f32,
+    repeat_last_n: usize,
+    verbose_prompt: bool,
+    dtype: DType,
+}
+
+impl TextGeneration {
+    #[allow(clippy::too_many_arguments)]
+    fn new(
+        model: Model,
+        tokenizer: Tokenizer,
+        seed: u64,
+        temp: Option<f64>,
+        top_p: Option<f64>,
+        repeat_penalty: f32,
+        repeat_last_n: usize,
+        verbose_prompt: bool,
+        device: &Device,
+        dtype: DType,
+    ) -> Self {
+        let logits_processor = LogitsProcessor::new(seed, temp, top_p);
+        Self {
+            model,
+            tokenizer,
+            logits_processor,
+            repeat_penalty,
+            repeat_last_n,
+            verbose_prompt,
+            device: device.clone(),
+            dtype,
+        }
+    }
+
+    fn run(&mut self, sample_len: usize) -> Result<(), ()> {
+        use std::io::Write;
+
+        let stdin = std::io::stdin();
+        let reader = BufReader::new(stdin);
+        // Read prompts from standard input, one per line.
+        for line in reader.lines() {
+            println!("[Welcome to CodeGeeX4, please enter a prompt]");
+            let line = line.expect("Failed to read line");
+            let tokens = self.tokenizer.encode(line, true).expect("tokens error");
+            if tokens.is_empty() {
+                panic!("Empty prompts are not supported by this model.")
+            }
+            if self.verbose_prompt {
+                for (token, id) in tokens.get_tokens().iter().zip(tokens.get_ids().iter()) {
+                    let token = token.replace('▁', " ").replace("<0x0A>", "\n");
+                    println!("{id:7} -> '{token}'");
+                }
+            }
+            let eos_token = match self.tokenizer.get_vocab(true).get("<|endoftext|>") {
+                Some(token) => *token,
+                None => panic!("cannot find the endoftext token"),
+            };
+            let mut tokens = tokens.get_ids().to_vec();
+            let mut generated_tokens = 0usize;
+
+            std::io::stdout().flush().expect("output flush error");
+            let start_gen = std::time::Instant::now();
+
+            println!("sample-len {}", sample_len.blue());
+            let mut result = vec![];
+
+            for index in 0..sample_len {
+                let context_size = if index > 0 { 1 } else { tokens.len() };
+                let ctxt = &tokens[tokens.len().saturating_sub(context_size)..];
+                let input = Tensor::new(ctxt, &self.device)
+                    .unwrap()
+                    .unsqueeze(0)
+                    .expect("create tensor input error");
+                let logits = self.model.forward(&input).unwrap();
+                let logits = logits.squeeze(0).unwrap().to_dtype(self.dtype).unwrap();
+                let logits = if self.repeat_penalty == 1. {
+                    logits
+                } else {
+                    let start_at = tokens.len().saturating_sub(self.repeat_last_n);
+                    candle_transformers::utils::apply_repeat_penalty(
+                        &logits,
+                        self.repeat_penalty,
+                        &tokens[start_at..],
+                    )
+                    .unwrap()
+                };
+
+                let next_token = self.logits_processor.sample(&logits).unwrap();
+                tokens.push(next_token);
+                generated_tokens += 1;
+                if next_token == eos_token {
+                    break;
+                }
+                let token = self
+                    .tokenizer
+                    .decode(&[next_token], true)
+                    .expect("Token error");
+                if self.verbose_prompt {
+                    println!(
+                        "[Index: {}] [Raw Token: {}] [Decode Token: {}]",
+                        index.blue(),
+                        next_token.green(),
+                        token.yellow()
+                    );
+                }
+                result.push(token);
+                std::io::stdout().flush().unwrap();
+            }
+            let dt = start_gen.elapsed();
+            println!(
+                "\n{generated_tokens} tokens generated ({:.2} token/s)",
+                generated_tokens as f64 / dt.as_secs_f64(),
+            );
+            println!("Result:");
+            for tokens in result {
+                print!("{tokens}");
+            }
+        }
+        self.model.reset_kv_cache(); // clear the model's KV cache between prompts
+        Ok(())
+    }
+}
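+
+// A scalar sketch of what apply_repeat_penalty does to a single logit
+// (assumption: candle divides positive logits by the penalty and multiplies
+// negative ones, for every token id seen in the last repeat_last_n tokens).
+#[cfg(test)]
+mod repeat_penalty_sketch {
+    fn penalize(logit: f32, penalty: f32) -> f32 {
+        if logit >= 0.0 {
+            logit / penalty
+        } else {
+            logit * penalty
+        }
+    }
+
+    #[test]
+    fn repeated_tokens_are_demoted() {
+        assert!(penalize(2.0, 1.1) < 2.0); // positive logit shrinks
+        assert!(penalize(-2.0, 1.1) < -2.0); // negative logit drops further
+    }
+}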
+
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// Path used as the huggingface hub cache.
+    #[arg(name = "cache", short, long, default_value = ".")]
+    cache_path: String,
+
+    /// Run on CPU rather than on GPU.
+    #[arg(long)]
+    cpu: bool,
+
+    /// Display the token for the specified prompt.
+    #[arg(long)]
+    verbose_prompt: bool,
+
+    #[arg(long)]
+    prompt: String,
+
+    /// The temperature used to generate samples.
+    #[arg(long)]
+    temperature: Option<f64>,
+
+    /// Nucleus sampling probability cutoff.
+    #[arg(long)]
+    top_p: Option<f64>,
+
+    /// The seed to use when generating random samples.
+    #[arg(long)]
+    seed: Option<u64>,
+
+    /// The length of the sample to generate (in tokens).
+    #[arg(long, short = 'n', default_value_t = 5000)]
+    sample_len: usize,
+
+    #[arg(long)]
+    model_id: Option<String>,
+
+    #[arg(long)]
+    revision: Option<String>,
+
+    #[arg(long)]
+    weight_file: Option<String>,
+
+    #[arg(long)]
+    tokenizer: Option<String>,
+
+    /// Penalty to be applied for repeating tokens, 1. means no penalty.
+    #[arg(long, default_value_t = 1.1)]
+    repeat_penalty: f32,
+
+    /// The context size to consider for the repeat penalty.
+    #[arg(long, default_value_t = 64)]
+    repeat_last_n: usize,
+}
+
+fn main() -> Result<(), ()> {
+    let args = Args::parse();
+    println!(
+        "avx: {}, neon: {}, simd128: {}, f16c: {}",
+        candle::utils::with_avx().red(),
+        candle::utils::with_neon().red(),
+        candle::utils::with_simd128().red(),
+        candle::utils::with_f16c().red(),
+    );
+    println!(
+        "temp: {:.2} repeat-penalty: {:.2} repeat-last-n: {}",
+        args.temperature.unwrap_or(0.95).red(),
+        args.repeat_penalty.red(),
+        args.repeat_last_n.red(),
+    );
+
+    println!("cache path {}", args.cache_path.blue());
+    println!("Prompt: [{}]", args.prompt.green());
+    let seed: u64 = args.seed.unwrap_or_else(|| rand::thread_rng().gen());
+    println!("Using Seed {}", seed.red());
+    let api = hf_hub::api::sync::ApiBuilder::from_cache(hf_hub::Cache::new(args.cache_path.into()))
+        .build()
+        .unwrap();
+
+    let model_id = match args.model_id {
+        Some(model_id) => model_id,
+        None => "THUDM/codegeex4-all-9b".to_string(),
+    };
+    let revision = match args.revision {
+        Some(rev) => rev.to_string(),
+        None => "main".to_string(),
+    };
+    let repo = api.repo(Repo::with_revision(model_id, RepoType::Model, revision));
+    let tokenizer_filename = match args.tokenizer {
+        Some(file) => std::path::PathBuf::from(file),
+        None => api
+            .model("THUDM/codegeex4-all-9b".to_string())
+            .get("tokenizer.json")
+            .unwrap(),
+    };
+    let filenames = match args.weight_file {
+        Some(weight_file) => vec![std::path::PathBuf::from(weight_file)],
+        None => {
+            candle_examples::hub_load_safetensors(&repo, "model.safetensors.index.json").unwrap()
+        }
+    };
+    let tokenizer = Tokenizer::from_file(tokenizer_filename).expect("Tokenizer Error");
+    let start = std::time::Instant::now();
+    let config = Config::codegeex4();
+    let device = candle_examples::device(args.cpu).unwrap();
+    let dtype = if device.is_cuda() {
+        DType::BF16
+    } else {
+        DType::F32
+    };
+    println!("DType is {:?}", dtype.yellow());
+    let vb = unsafe { VarBuilder::from_mmaped_safetensors(&filenames, dtype, &device).unwrap() };
+    let model = Model::new(&config, vb).unwrap();
+
+    println!("Model loaded in {:?}s", start.elapsed().as_secs().green());
+
+    let mut pipeline = TextGeneration::new(
+        model,
+        tokenizer,
+        seed,
+        args.temperature,
+        args.top_p,
+        args.repeat_penalty,
+        args.repeat_last_n,
+        args.verbose_prompt,
+        &device,
+        dtype,
+    );
+    pipeline.run(args.sample_len)?;
+    Ok(())
+}

+ 159 - 0
function_call_demo/README.md

@@ -0,0 +1,159 @@
+![](../resources/logo.jpeg)
+
+[English](README.md) | [Chinese](README_zh.md)
+
+## Function Call
+
+The CodeGeeX4 model supports function calling, which allows the model to select **one** or **multiple** tools from the
+candidates based on the problem.
+
+## Usage Example
+
+### 1. Install Dependencies
+
+```bash
+cd function_call_demo
+pip install -r requirements.txt
+```
+
+### 2. Run the Script
+
+```bash
+python main.py
+>>> [{"name": "weather", "arguments": {"location": "Beijing"}}]
+```
+
+## Explanation
+
+### 1. Single Call from Multiple Tools
+
+In the example script, only one tool is provided as a candidate. However, you can also provide multiple tools as candidates as needed.
+Here is an example:
+
+```python
+tool_content = {
+    "function": [
+        {
+            "name": "weather",
+            "description": "Use for searching weather at a specific location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "description": "the location need to check the weather",
+                        "type": "str",
+                    }
+                },
+                "required": [
+                    "location"
+                ]
+            }
+        },
+        {
+            "name": "Cooking/queryDish",
+            "description": "Cooking API, providing cooking methods for different cuisines. It queries dish information based on specified parameters.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "cuisine": {
+                        "description": "Specify the cuisine to be queried, such as Sichuan cuisine, Cantonese cuisine, Hunan cuisine."
+                    },
+                    "dish": {
+                        "description": "Specify the name of the dish to be queried."
+                    },
+                    "difficulty": {
+                        "description": "Specify the difficulty of the dish to be queried, such as beginner, intermediate, advanced."
+                    }
+                },
+                "required": [
+                    "cuisine"
+                ]
+            }
+        }
+    ]
+}
+response, _ = model.chat(
+    tokenizer,
+    query="How to make Kung-Pao Chicken",
+    history=[{"role": "tool", "content": tool_content}],
+    max_new_tokens=1024,
+    temperature=0.1
+)
+```
+
+The following result is obtained:
+
+```text
+[{'name': 'Cooking/queryDish', 'arguments': {'cuisine': 'Sichuan cuisine', 'dish': 'Kung-Pao Chicken'}}]
+```
+
+### 2. Multiple Calls from Multiple Tools
+
+Additionally, for complex problems, the model has the ability to select and call multiple tools from the candidates. Here is an example:
+
+```python
+tool_content = {
+    "function": [
+        {
+            "name": "flight_book",
+            "description": "Book a flight for a specific route and airlines",
+            "parameters": {
+                "type": "dict",
+                "properties": {
+                    "from": {
+                        "type": "string",
+                        "description": "The departure city in full name."
+                    },
+                    "to": {
+                        "type": "string",
+                        "description": "The arrival city in full name."
+                    },
+                    "airlines": {
+                        "type": "string",
+                        "description": "The preferred airline."
+                    }
+                },
+                "required": [
+                    "from",
+                    "to",
+                    "airlines"
+                ]
+            }
+        },
+        {
+            "name": "hotel_book",
+            "description": "Book a hotel for a specific location for the number of nights",
+            "parameters": {
+                "type": "dict",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city where the hotel is located."
+                    },
+                    "nights": {
+                        "type": "integer",
+                        "description": "Number of nights for the stay."
+                    }
+                },
+                "required": [
+                    "location",
+                    "nights"
+                ]
+            }
+        }
+    ]
+}
+response, _ = model.chat(
+    tokenizer,
+    query="Book a flight from Seattle to Boston with American Airlines and book a hotel in Boston for 4 nights.",
+    history=[{"role": "tool", "content": tool_content}],
+    max_new_tokens=1024,
+    temperature=0.1
+)
+```
+
+The following result is obtained:
+
+```text
+[{'name': 'flight_book', 'arguments': {'from': 'Seattle', 'to': 'Boston', 'airlines': 'American Airlines'}}, {'name': 'hotel_book', 'arguments': {'location': 'Boston', 'nights': 4}}]
+```

+ 158 - 0
function_call_demo/README_zh.md

@@ -0,0 +1,158 @@
+![](../resources/logo.jpeg)
+
+[English](README.md) | [Chinese](README_zh.md)
+
+## Function Call
+
+The CodeGeeX4 model supports function calling, which allows the model to select **one** or **multiple** tools from the candidates based on the problem.
+
+## Usage Example
+
+### 1. Install Dependencies
+
+```bash
+cd function_call_demo
+pip install -r requirements.txt
+```
+
+### 2. Run the Script
+
+```bash
+python main.py
+>>> [{"name": "weather", "arguments": {"location": "Beijing"}}]
+```
+
+## Explanation
+
+### 1. Single Call from Multiple Tools
+
+The example script provides only one tool as a candidate. In practice, you can provide multiple tools as candidates as needed, for example:
+
+```python
+tool_content = {
+    "function": [
+        {
+            "name": "weather",
+            "description": "Use for searching weather at a specific location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "description": "the location need to check the weather",
+                        "type": "str",
+                    }
+                },
+                "required": [
+                    "location"
+                ]
+            }
+        },
+        {
+            "name": "Cooking/queryDish",
+            "description": "Cooking API, providing cooking methods for different cuisines. It queries dish information based on specified parameters.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "cuisine": {
+                        "description": "Specify the cuisine to be queried, such as Sichuan cuisine, Cantonese cuisine, Hunan cuisine."
+                    },
+                    "dish": {
+                        "description": "Specify the name of the dish to be queried."
+                    },
+                    "difficulty": {
+                        "description": "Specify the difficulty of the dish to be queried, such as beginner, intermediate, advanced."
+                    }
+                },
+                "required": [
+                    "cuisine"
+                ]
+            }
+        }
+    ]
+}
+response, _ = model.chat(
+    tokenizer,
+    query="How to make Kung-Pao Chicken",
+    history=[{"role": "tool", "content": tool_content}],
+    max_new_tokens=1024,
+    temperature=0.1
+)
+```
+
+The following result is obtained:
+
+```text
+[{'name': 'Cooking/queryDish', 'arguments': {'cuisine': 'Sichuan cuisine', 'dish': 'Kung-Pao Chicken'}}]
+```
+
+### 2. Multiple Calls from Multiple Tools
+
+In addition, for complex problems the model can select and call multiple tools from the candidates, for example:
+
+```python
+tool_content = {
+    "function": [
+        {
+            "name": "flight_book",
+            "description": "Book a flight for a specific route and airlines",
+            "parameters": {
+                "type": "dict",
+                "properties": {
+                    "from": {
+                        "type": "string",
+                        "description": "The departure city in full name."
+                    },
+                    "to": {
+                        "type": "string",
+                        "description": "The arrival city in full name."
+                    },
+                    "airlines": {
+                        "type": "string",
+                        "description": "The preferred airline."
+                    }
+                },
+                "required": [
+                    "from",
+                    "to",
+                    "airlines"
+                ]
+            }
+        },
+        {
+            "name": "hotel_book",
+            "description": "Book a hotel for a specific location for the number of nights",
+            "parameters": {
+                "type": "dict",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city where the hotel is located."
+                    },
+                    "nights": {
+                        "type": "integer",
+                        "description": "Number of nights for the stay."
+                    }
+                },
+                "required": [
+                    "location",
+                    "nights"
+                ]
+            }
+        }
+    ]
+}
+response, _ = model.chat(
+    tokenizer,
+    query="Book a flight from Seattle to Boston with American Airlines and book a hotel in Boston for 4 nights.",
+    history=[{"role": "tool", "content": tool_content}],
+    max_new_tokens=1024,
+    temperature=0.1
+)
+```
+
+The following result is obtained:
+
+```text
+[{'name': 'flight_book', 'arguments': {'from': 'Seattle', 'to': 'Boston', 'airlines': 'American Airlines'}}, {'name': 'hotel_book', 'arguments': {'location': 'Boston', 'nights': 4}}]
+```
+

+ 4 - 0
function_call_demo/requirements.txt

@@ -0,0 +1,4 @@
+regex==2024.5.15
+tiktoken==0.7.0
+torch==2.3.1
+transformers==4.39.0

BIN
metric/.DS_Store


BIN
metric/pics/.DS_Store


BIN
metric/pics/Bigcodebench.png


BIN
resources/candle_example.png