From 759ee4c2e159dc4359aab0fb1f1e243b8c174894 Mon Sep 17 00:00:00 2001
From: Michael Platzer <michael.platzer@axelera.ai>
Date: Thu, 23 May 2024 11:51:02 +0200
Subject: [PATCH 1/8] =?UTF-8?q?=F0=9F=A9=B9=20Fix=20result=20of=20not=20eq?=
 =?UTF-8?q?ual=20compare=20w=20signaling=20NaNs=20(#116)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 🩹 Set result bit of not equal compare on signaling NaN

* 💡 Update comment w.r.t. signaling NaNs in compares
---
 src/fpnew_noncomp.sv | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/fpnew_noncomp.sv b/src/fpnew_noncomp.sv
index 8a182617..370e80e9 100644
--- a/src/fpnew_noncomp.sv
+++ b/src/fpnew_noncomp.sv
@@ -257,10 +257,12 @@ module fpnew_noncomp #(
     cmp_result = '0; // false
     cmp_status = '0; // no flags
 
-    // Signalling NaNs always compare as false and are illegal
-    if (signalling_nan) cmp_status.NV = 1'b1; // invalid operation
+    // Signalling NaNs always compare as false (except for "not equal" compares) and are illegal
+    if (signalling_nan) begin
+      cmp_status.NV = 1'b1; // invalid operation
+      cmp_result    = inp_pipe_rnd_mode_q[NUM_INP_REGS] == fpnew_pkg::RDN && inp_pipe_op_mod_q[NUM_INP_REGS];
     // Otherwise do comparisons
-    else begin
+    end else begin
       unique case (inp_pipe_rnd_mode_q[NUM_INP_REGS])
         fpnew_pkg::RNE: begin // Less than or equal
           if (any_operand_nan) cmp_status.NV = 1'b1; // Signalling comparison: NaNs are invalid

From 3bbe483f4d72f9693894304b3db13620e68c6b37 Mon Sep 17 00:00:00 2001
From: Luca Bertaccini <55843305+lucabertaccini@users.noreply.github.com>
Date: Mon, 3 Jun 2024 12:36:16 +0200
Subject: [PATCH 2/8] Add new multi-format DivSqrt unit from openC910 (FP64,
 FP32, FP16 + SIMD) (#8)

* Add new multi-format DivSqrt unit from openC910 supporting FP64, FP32, FP16, and SIMD operations
---
 Bender.yml                                    |   13 +
 README.license.md                             |    2 +-
 README.md                                     |    3 +-
 docs/CHANGELOG-PULP.md                        |    6 +
 docs/README.md                                |   14 +-
 src/fpnew_divsqrt_th_64_multi.sv              |  482 +++++++
 src/fpnew_opgroup_block.sv                    |    4 +-
 src/fpnew_opgroup_multifmt_slice.sv           |   77 +-
 src/fpnew_pkg.sv                              |   16 +
 src/fpnew_top.sv                              |    6 +-
 src_files.yml                                 |   14 +
 vendor/openc910.lock.hjson                    |   14 +
 vendor/openc910.vendor.hjson                  |   47 +
 .../gen_rtl/clk/rtl/gated_clk_cell.v          |   49 +
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v         |  520 ++++++++
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_double.v       |  370 ++++++
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_ff1.v          |   99 ++
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v         |  417 ++++++
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v      |  773 +++++++++++
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_round.v        | 1041 +++++++++++++++
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v    |  323 +++++
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v          |  691 ++++++++++
 .../rtl/ct_vfdsu_srt_radix16_bound_table.v    | 1168 +++++++++++++++++
 .../rtl/ct_vfdsu_srt_radix16_with_sqrt.v      | 1152 ++++++++++++++++
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_top.v          |  331 +++++
 vendor/openc910/LICENSE                       |  201 +++
 vendor/openc910/README.md                     |   74 ++
 27 files changed, 7880 insertions(+), 27 deletions(-)
 create mode 100644 src/fpnew_divsqrt_th_64_multi.sv
 create mode 100644 vendor/openc910.lock.hjson
 create mode 100644 vendor/openc910.vendor.hjson
 create mode 100644 vendor/openc910/C910_RTL_FACTORY/gen_rtl/clk/rtl/gated_clk_cell.v
 create mode 100644 vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v
 create mode 100644 vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_double.v
 create mode 100644 vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ff1.v
 create mode 100644 vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v
 create mode 100644 vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v
 create mode 100644 vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v
 create mode 100644 vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v
 create mode 100644 vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v
 create mode 100644 vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt_radix16_bound_table.v
 create mode 100644 vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt_radix16_with_sqrt.v
 create mode 100644 vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v
 create mode 100644 vendor/openc910/LICENSE
 create mode 100644 vendor/openc910/README.md

diff --git a/Bender.yml b/Bender.yml
index fff51ec3..b635aa07 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -25,7 +25,20 @@ sources:
   - vendor/opene906/E906_RTL_FACTORY/gen_rtl/fpu/rtl/pa_fpu_dp.v
   - vendor/opene906/E906_RTL_FACTORY/gen_rtl/fpu/rtl/pa_fpu_frbus.v
   - vendor/opene906/E906_RTL_FACTORY/gen_rtl/fpu/rtl/pa_fpu_src_type.v
+#  - vendor/openc910/C910_RTL_FACTORY/gen_rtl/clk/rtl/gated_clk_cell.v # same as the one from E906
+  - vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v
+  - vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_double.v
+  - vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ff1.v
+  - vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v
+  - vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v
+  - vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v
+  - vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v
+  - vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt_radix16_bound_table.v
+  - vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt_radix16_with_sqrt.v
+  - vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v
+  - vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v
   - src/fpnew_divsqrt_th_32.sv
+  - src/fpnew_divsqrt_th_64_multi.sv
   - src/fpnew_divsqrt_multi.sv
   - src/fpnew_fma.sv
   - src/fpnew_fma_multi.sv
diff --git a/README.license.md b/README.license.md
index ebbb64d3..15c7b69b 100644
--- a/README.license.md
+++ b/README.license.md
@@ -2,4 +2,4 @@
 
 FPnew is released under the *SolderPad Hardware License*, which is a permissive license based on Apache 2.0. Please refer to the [SolderPad license file](LICENSE.solderpad) for further information.
 
-The T-Head E906 DivSqrt unit, integrated into FPnew in [`vendor/opene906`](vendor/opene906), is reseased under the *Apache License, Version 2.0*. Please refer to the [Apache 2.0 license file](LICENSE.apache) for further information.
+The T-Head E906 and C910 DivSqrt units, integrated into FPnew in [`vendor/opene906`](vendor/opene906) and [`vendor/openc910`](vendor/openc910), are released under the *Apache License, Version 2.0*. Please refer to the [Apache 2.0 license file](LICENSE.apache) for further information.
diff --git a/README.md b/README.md
index 942d5b86..b13f00d1 100644
--- a/README.md
+++ b/README.md
@@ -88,8 +88,7 @@ It is discouraged to `import` all of `fpnew_pkg` into your source files. Instead
 fpnew_top #(
   .Features       ( fpnew_pkg::RV64D          ),
   .Implementation ( fpnew_pkg::DEFAULT_NOREGS ),
-  .TagType        ( logic                     ),
-  .PulpDivsqrt    ( 1'b1                      )
+  .TagType        ( logic                     )
 ) i_fpnew_top (
   .clk_i,
   .rst_ni,
diff --git a/docs/CHANGELOG-PULP.md b/docs/CHANGELOG-PULP.md
index dd327b66..cd09eda5 100644
--- a/docs/CHANGELOG-PULP.md
+++ b/docs/CHANGELOG-PULP.md
@@ -7,6 +7,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a
 In this sense, we interpret the "Public API" of a hardware module as its port/parameter list.
 Versions of the IP in the same major relase are "pin-compatible" with each other. Minor relases are permitted to add new parameters as long as their default bindings ensure backwards compatibility.
 
+## [pulp-v0.2.0] - 2024-05-29
+
+### Added
+- Add support for alternative multi-format DivSqrt unit (from openC910), supporting FP64, FP32, FP16 and SIMD operations
+- Replace `PulpDivsqrt` top-level parameter with `DivSqrtSel` to choose among the legacy PULP DivSqrt unit (`PULP`), the openE906 DivSqrt (`TH32`), and the openC910 DivSqrt (`THMULTI`). The default choice is set to `THMULTI`
+
 ## [pulp-v0.1.3] - 2023-07-19
 
 ### Fixed
diff --git a/docs/README.md b/docs/README.md
index 542e53e1..dd8a0e9b 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -37,6 +37,7 @@ For more in-depth explanations on how to configure the unit and the layout of th
 |------------------|------------------------------------------------------------------------------------------------------------------------------|
 | `Features`       | Specifies the features of the FPU, such as the set of supported formats and operations.                                      |
 | `Implementation` | Allows to control how the above features are implemented, such as the number of pipeline stages and architecture of subunits |
+| `DivSqrtSel`     | Chooses among the three supported DivSqrt units                                                                              |
 | `TagType`        | The SystemVerilog data type of the operation tag                                                                             |
 | `TrueSIMDClass`  | If enabled, the result of a classify operation in vectorial mode will be RISC-V compliant if each output has at least 10 bits|
 | `EnableSIMDMask` | Enable the RISC-V floating-point status flags masking of inactive vectorial lanes. When disabled, `simd_mask_i` is inactive  |
@@ -358,7 +359,18 @@ The configuration  `pipe_config_t` is an enumeration of type `logic [1:0]` holdi
 | `INSIDE`      | All registers are inserted at roughly the middle of the operational unit (if not possible, `BEFORE`) |
 | `DISTRIBUTED` | Registers are evenly distributed to `INSIDE`, `BEFORE`, and `AFTER` (if no `INSIDE`, all `BEFORE`)   |
 
-### `Stochastic Rounding Implementation`
+#### `Division and Square-Root Unit Selection`
+The `DivSqrtSel` parameter is used to choose among the supported DivSqrt units.
+It is of type `divsqrt_unit_t`, which is defined as:
+```SystemVerilog
+typedef enum logic[1:0] {
+  PULP,    // "PULP" instantiates the PULP DivSqrt unit, which supports FP64, FP32, FP16, FP16ALT, FP8 and SIMD operations
+  TH32,    // "TH32" instantiates the E906 DivSqrt unit, which supports only FP32 (no SIMD support)
+  THMULTI  // "THMULTI" instantiates the C910 DivSqrt unit, which supports FP64, FP32, FP16 and SIMD operations
+} divsqrt_unit_t;
+```
+
+#### `Stochastic Rounding Implementation`
 
 The `StochasticRndImplementation` parameter is used to configure the RSR support.
 It is of type `rsr_impl_t` which is defined as:
diff --git a/src/fpnew_divsqrt_th_64_multi.sv b/src/fpnew_divsqrt_th_64_multi.sv
new file mode 100644
index 00000000..eff0620d
--- /dev/null
+++ b/src/fpnew_divsqrt_th_64_multi.sv
@@ -0,0 +1,482 @@
+// Copyright 2019 ETH Zurich and University of Bologna.
+//
+// Copyright and related rights are licensed under the Solderpad Hardware
+// License, Version 0.51 (the "License"); you may not use this file except in
+// compliance with the License. You may obtain a copy of the License at
+// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+// or agreed to in writing, software, hardware and materials distributed under
+// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
+
+// Authors: Stefan Mach <smach@iis.ee.ethz.ch>
+//          Roman Marquart <maroman@student.ethz.ch>
+
+
+`include "common_cells/registers.svh"
+
+module fpnew_divsqrt_th_64_multi #(
+  parameter fpnew_pkg::fmt_logic_t   FpFmtConfig  = '1,
+  // FPU configuration
+  parameter int unsigned             NumPipeRegs = 0,
+  parameter fpnew_pkg::pipe_config_t PipeConfig  = fpnew_pkg::AFTER,
+  parameter type                     TagType     = logic,
+  parameter type                     AuxType     = logic,
+  // Do not change
+  localparam int unsigned WIDTH       = fpnew_pkg::max_fp_width(FpFmtConfig),
+  localparam int unsigned NUM_FORMATS = fpnew_pkg::NUM_FP_FORMATS
+) (
+  input  logic                        clk_i,
+  input  logic                        rst_ni,
+  // Input signals
+  input  logic [1:0][WIDTH-1:0]       operands_i, // 2 operands
+  input  logic [NUM_FORMATS-1:0][1:0] is_boxed_i, // 2 operands
+  input  fpnew_pkg::roundmode_e       rnd_mode_i,
+  input  fpnew_pkg::operation_e       op_i,
+  input  fpnew_pkg::fp_format_e       dst_fmt_i,
+  input  TagType                      tag_i,
+  input  logic                        mask_i,
+  input  AuxType                      aux_i,
+  input  logic                        vectorial_op_i,
+  // Input Handshake
+  input  logic                        in_valid_i,
+  output logic                        in_ready_o,
+  output logic                        divsqrt_done_o,
+  input  logic                        simd_synch_done_i,
+  output logic                        divsqrt_ready_o,
+  input  logic                        simd_synch_rdy_i,
+  input  logic                        flush_i,
+  // Output signals
+  output logic [WIDTH-1:0]            result_o,
+  output fpnew_pkg::status_t          status_o,
+  output logic                        extension_bit_o,
+  output TagType                      tag_o,
+  output logic                        mask_o,
+  output AuxType                      aux_o,
+  // Output handshake
+  output logic                        out_valid_o,
+  input  logic                        out_ready_i,
+  // Indication of valid data in flight
+  output logic                        busy_o
+);
+
+  // ----------
+  // Constants
+  // ----------
+  // Pipelines
+  localparam NUM_INP_REGS = (PipeConfig == fpnew_pkg::BEFORE)
+                            ? NumPipeRegs
+                            : (PipeConfig == fpnew_pkg::DISTRIBUTED
+                               ? (NumPipeRegs / 2) // Last to get distributed regs
+                               : 0); // no regs here otherwise
+  localparam NUM_OUT_REGS = (PipeConfig == fpnew_pkg::AFTER || PipeConfig == fpnew_pkg::INSIDE)
+                            ? NumPipeRegs
+                            : (PipeConfig == fpnew_pkg::DISTRIBUTED
+                               ? ((NumPipeRegs + 1) / 2) // First to get distributed regs
+                               : 0); // no regs here otherwise
+
+  // ---------------
+  // Input pipeline
+  // ---------------
+  // Selected pipeline output signals as non-arrays
+  logic [1:0][WIDTH-1:0] operands_q;
+  fpnew_pkg::roundmode_e rnd_mode_q;
+  fpnew_pkg::operation_e op_q;
+  fpnew_pkg::fp_format_e dst_fmt_q;
+  logic                  in_valid_q;
+
+  // Input pipeline signals, index i holds signal after i register stages
+  logic                  [0:NUM_INP_REGS][1:0][WIDTH-1:0]       inp_pipe_operands_q;
+  fpnew_pkg::roundmode_e [0:NUM_INP_REGS]                       inp_pipe_rnd_mode_q;
+  fpnew_pkg::operation_e [0:NUM_INP_REGS]                       inp_pipe_op_q;
+  fpnew_pkg::fp_format_e [0:NUM_INP_REGS]                       inp_pipe_dst_fmt_q;
+  TagType                [0:NUM_INP_REGS]                       inp_pipe_tag_q;
+  logic                  [0:NUM_INP_REGS]                       inp_pipe_mask_q;
+  AuxType                [0:NUM_INP_REGS]                       inp_pipe_aux_q;
+  logic                  [0:NUM_INP_REGS]                       inp_pipe_vec_op_q;
+  logic                  [0:NUM_INP_REGS]                       inp_pipe_valid_q;
+  // Ready signal is combinatorial for all stages
+  logic [0:NUM_INP_REGS] inp_pipe_ready;
+
+  // Input stage: First element of pipeline is taken from inputs
+  assign inp_pipe_operands_q[0] = operands_i;
+  assign inp_pipe_rnd_mode_q[0] = rnd_mode_i;
+  assign inp_pipe_op_q[0]       = op_i;
+  assign inp_pipe_dst_fmt_q[0]  = dst_fmt_i;
+  assign inp_pipe_tag_q[0]      = tag_i;
+  assign inp_pipe_mask_q[0]     = mask_i;
+  assign inp_pipe_aux_q[0]      = aux_i;
+  assign inp_pipe_vec_op_q[0]   = vectorial_op_i;
+  assign inp_pipe_valid_q[0]    = in_valid_i;
+  // Input stage: Propagate pipeline ready signal to upstream circuitry
+  assign in_ready_o = inp_pipe_ready[0];
+  // Generate the register stages
+  for (genvar i = 0; i < NUM_INP_REGS; i++) begin : gen_input_pipeline
+    // Load enable of stage i: high when a valid item moves from index i to index i+1
+    logic reg_ena;
+    // Determine the ready signal of the current stage - advance the pipeline:
+    // 1. if the next stage is ready for our data
+    // 2. if the next stage only holds a bubble (not valid) -> we can pop it
+    assign inp_pipe_ready[i] = inp_pipe_ready[i+1] | ~inp_pipe_valid_q[i+1];
+    // Valid: enabled by ready signal, synchronous clear with the flush signal
+    `FFLARNC(inp_pipe_valid_q[i+1], inp_pipe_valid_q[i], inp_pipe_ready[i], flush_i, 1'b0, clk_i, rst_ni)
+    // Enable register if pipeline ready and a valid data item is present
+    assign reg_ena = inp_pipe_ready[i] & inp_pipe_valid_q[i];
+    // Payload registers of this stage: they only load on reg_ena, so bubbles do not toggle them
+    `FFL(inp_pipe_operands_q[i+1], inp_pipe_operands_q[i], reg_ena, '0)
+    `FFL(inp_pipe_rnd_mode_q[i+1], inp_pipe_rnd_mode_q[i], reg_ena, fpnew_pkg::RNE)
+    `FFL(inp_pipe_op_q[i+1],       inp_pipe_op_q[i],       reg_ena, fpnew_pkg::FMADD)
+    `FFL(inp_pipe_dst_fmt_q[i+1],  inp_pipe_dst_fmt_q[i],  reg_ena, fpnew_pkg::fp_format_e'(0))
+    `FFL(inp_pipe_tag_q[i+1],      inp_pipe_tag_q[i],      reg_ena, TagType'('0))
+    `FFL(inp_pipe_mask_q[i+1],     inp_pipe_mask_q[i],     reg_ena, '0)
+    `FFL(inp_pipe_aux_q[i+1],      inp_pipe_aux_q[i],      reg_ena, AuxType'('0))
+    `FFL(inp_pipe_vec_op_q[i+1],   inp_pipe_vec_op_q[i],   reg_ena, '0) // 1-bit flag: reset with '0, not an AuxType cast
+  end
+  // Output stage: assign selected pipe outputs to signals for later use
+  assign operands_q = inp_pipe_operands_q[NUM_INP_REGS];
+  assign rnd_mode_q = inp_pipe_rnd_mode_q[NUM_INP_REGS];
+  assign op_q       = inp_pipe_op_q[NUM_INP_REGS];
+  assign dst_fmt_q  = inp_pipe_dst_fmt_q[NUM_INP_REGS];
+  assign in_valid_q = inp_pipe_valid_q[NUM_INP_REGS];
+
+  // -----------------
+  // Input processing
+  // -----------------
+  logic [1:0] divsqrt_fmt;
+
+  // Translate fpnew formats into divsqrt formats
+  if(WIDTH == 64) begin : translate_fmt_64_bits
+    always_comb begin : translate_fmt
+      unique case (dst_fmt_q)
+        fpnew_pkg::FP64:    divsqrt_fmt = 2'b10;
+        fpnew_pkg::FP32:    divsqrt_fmt = 2'b01;
+        fpnew_pkg::FP16:    divsqrt_fmt = 2'b00;
+        default:            divsqrt_fmt = 2'b10; // 64 bit max width
+      endcase
+    end
+  end else if(WIDTH == 32) begin : translate_fmt_32_bits
+    always_comb begin : translate_fmt
+      unique case (dst_fmt_q)
+        fpnew_pkg::FP32:    divsqrt_fmt = 2'b01;
+        fpnew_pkg::FP16:    divsqrt_fmt = 2'b00;
+        default:            divsqrt_fmt = 2'b01; // 32 bit max width
+      endcase
+    end
+  end else if(WIDTH == 16) begin : translate_fmt_16_bits
+    always_comb begin : translate_fmt
+      unique case (dst_fmt_q)
+        fpnew_pkg::FP16:    divsqrt_fmt = 2'b00;
+        default:            divsqrt_fmt = 2'b00; // 16 bit max width
+      endcase
+    end
+  end else begin
+    $fatal(1, "DivSqrt THMULTI: Unsupported WIDTH (the supported width are 64, 32, 16)");
+  end
+
+  // ------------
+  // Control FSM
+  // ------------
+
+  logic in_ready;               // input handshake with upstream
+  logic div_valid, sqrt_valid;  // input signalling with unit
+  logic unit_ready, unit_done, unit_done_q;  // status signals from unit instance
+  logic op_starting;            // high in the cycle a new operation starts
+  logic out_valid, out_ready;   // output handshake with downstream
+  logic unit_busy;              // valid data in flight
+  logic simd_synch_done;
+  // FSM states
+  typedef enum logic [1:0] {IDLE, BUSY, HOLD} fsm_state_e;
+  fsm_state_e state_q, state_d;
+
+  // Valids are gated by the FSM ready. Invalid input ops run a sqrt to not lose illegal instr.
+  assign div_valid   = in_valid_q & (op_q == fpnew_pkg::DIV) & in_ready & ~flush_i;
+  assign sqrt_valid  = in_valid_q & (op_q != fpnew_pkg::DIV) & in_ready & ~flush_i;
+  assign op_starting = div_valid | sqrt_valid;
+
+  // Hold additional information while the operation is in progress
+  
+  TagType result_tag_q;
+  logic result_mask_q;
+  AuxType result_aux_q;
+  logic result_vec_op_q;
+
+  // Fill the registers everytime a valid operation arrives (load FF, active low asynch rst)
+  `FFL(result_tag_q,    inp_pipe_tag_q[NUM_INP_REGS], op_starting, '0)
+  `FFL(result_mask_q,   inp_pipe_mask_q[NUM_INP_REGS],op_starting, '0)
+  `FFL(result_aux_q,    inp_pipe_aux_q[NUM_INP_REGS], op_starting, '0)
+  `FFL(result_vec_op_q, inp_pipe_vec_op_q[NUM_INP_REGS], op_starting, '0)
+
+  // Wait for other lanes only if the operation is vectorial
+  assign simd_synch_done = simd_synch_done_i || ~result_vec_op_q;
+
+  // Valid synch with other lanes
+  // When one divsqrt unit completes an operation, keep its done high, waiting for the other lanes
+  // As soon as all the lanes are over, we can clear this FF and start with a new operation
+  `FFLARNC(unit_done_q, unit_done, unit_done, simd_synch_done, 1'b0, clk_i, rst_ni);
+  // Tell the other units that this unit has finished now or in the past
+  assign divsqrt_done_o = (unit_done_q | unit_done) & result_vec_op_q;
+
+  // Ready synch with other lanes
+  // Bring the FSM-generated ready outside the unit, to synchronize it with the other lanes
+  assign divsqrt_ready_o = in_ready;
+  // Upstream ready comes from sanitization FSM, and it is synched among all the lanes
+  assign inp_pipe_ready[NUM_INP_REGS] = result_vec_op_q ? simd_synch_rdy_i : in_ready;
+
+  // FSM to safely apply and receive data from DIVSQRT unit
+  always_comb begin : flag_fsm
+    // Defaults: not ready, nothing to commit, nothing in flight, stay in the current state
+    in_ready     = 1'b0;
+    out_valid    = 1'b0;
+    unit_busy    = 1'b0;
+    state_d      = state_q;
+
+    unique case (state_q)
+      // Waiting for work
+      IDLE: begin
+        in_ready = 1'b1; // always ready in IDLE; NOTE(review): asserted even when unit_ready is low - confirm the unit is never busy here
+        if (in_valid_q && unit_ready) begin // New work arrives and the unit can take it
+          state_d = BUSY; // go into processing state
+        end
+      end
+      // Operation in progress
+      BUSY: begin
+        unit_busy = 1'b1; // data in flight
+        // Result available: SIMD synch reports done, or a non-vectorial op finished in this unit
+        if (simd_synch_done_i || (~result_vec_op_q && unit_done)) begin
+          out_valid = 1'b1; // try to commit result downstream
+          // If downstream accepts our result
+          if (out_ready) begin
+            state_d = IDLE; // we anticipate going back to idling..
+            in_ready = 1'b1; // ready for the next instruction in the same cycle (not gated by unit_ready)
+            if (in_valid_q && unit_ready) begin // ..unless new work comes in
+              state_d  = BUSY; // and stay busy with it
+            end
+          // Otherwise if downstream is not ready for the result
+          end else begin
+            state_d     = HOLD; // wait for the pipeline to take the data
+          end
+        end
+      end
+      // Waiting with valid result for downstream
+      HOLD: begin
+        unit_busy    = 1'b1; // data in flight
+        out_valid    = 1'b1; // keep offering the result downstream
+        // If the result is accepted by downstream
+        if (out_ready) begin
+          state_d = IDLE; // go back to idle..
+          if (in_valid_q && unit_ready) begin // ..unless new work comes in
+            in_ready = 1'b1; // acknowledge the new transaction (here only when the unit is ready)
+            state_d  = BUSY; // will be busy with the next instruction
+          end
+        end
+      end
+      // Unused state encoding: recover to IDLE
+      default: state_d = IDLE;
+    endcase
+
+    // Flushing overrides the state/output decisions above (in_ready is left as computed)
+    if (flush_i) begin
+      unit_busy = 1'b0; // data is invalidated
+      out_valid = 1'b0; // cancel any valid data
+      state_d   = IDLE; // go to default state
+    end
+  end
+
+  // FSM status register (asynch active low reset)
+  `FF(state_q, state_d, IDLE)
+
+  // -----------------
+  // DIVSQRT instance
+  // -----------------
+  logic [63:0]   unit_result, held_result_q;
+  fpnew_pkg::status_t unit_status, held_status_q;
+  logic               hold_en;
+  
+  logic vfdsu_dp_fdiv_busy;
+  
+  // Regs to save current instruction
+  fpnew_pkg::roundmode_e rm_q;
+  logic[1:0] divsqrt_fmt_q;
+  fpnew_pkg::operation_e divsqrt_op_q;
+  logic div_op, sqrt_op;
+  logic [WIDTH-1:0] srcf0_q, srcf1_q;
+  logic [63:0] srcf0, srcf1;
+  
+  // Save operands in regs, C910 saves all the following information in its regs in the next cycle.
+  `FFL(rm_q, rnd_mode_q, op_starting, fpnew_pkg::RNE)
+  `FFL(divsqrt_fmt_q, divsqrt_fmt, op_starting, '0)
+  `FFL(divsqrt_op_q, op_q, op_starting, fpnew_pkg::DIV)
+  `FFL(srcf0_q, operands_q[0], op_starting, '0)
+  `FFL(srcf1_q, operands_q[1], op_starting, '0)
+
+  // NaN-box inputs with max WIDTH
+  if(WIDTH == 64) begin : gen_fmt_64_bits
+    always_comb begin : NaN_box_inputs
+      if(divsqrt_fmt_q == 2'b10) begin // 64-bit
+        srcf0[63:0] = srcf0_q[63:0];
+        srcf1[63:0] = srcf1_q[63:0];
+      end else if(divsqrt_fmt_q == 2'b01) begin // 32-bit
+        srcf0[63:32] = '1;
+        srcf1[63:32] = '1;
+        srcf0[31:0] = srcf0_q[31:0];
+        srcf1[31:0] = srcf1_q[31:0];
+      end else if(divsqrt_fmt_q == 2'b00) begin //16-bit
+        srcf0[63:16] = '1;
+        srcf1[63:16] = '1;
+        srcf0[15:0] = srcf0_q[15:0];
+        srcf1[15:0] = srcf1_q[15:0];
+      end else begin // Unsupported
+        srcf0[63:0] = '1;
+        srcf1[63:0] = '1;
+      end
+    end
+  end else if (WIDTH == 32) begin : gen_fmt_32_bits
+    always_comb begin : NaN_box_inputs
+      if(divsqrt_fmt_q == 2'b01) begin // 32-bit
+        srcf0[63:32] = '1;
+        srcf1[63:32] = '1;
+        srcf0[31:0] = srcf0_q[31:0];
+        srcf1[31:0] = srcf1_q[31:0];
+      end else if(divsqrt_fmt_q == 2'b00) begin // 16-bit
+        srcf0[63:16] = '1;
+        srcf1[63:16] = '1;
+        srcf0[15:0] = srcf0_q[15:0];
+        srcf1[15:0] = srcf1_q[15:0];
+      end else begin // Unsupported
+        srcf0[63:0] = '1;
+        srcf1[63:0] = '1;
+      end
+    end
+  end else if (WIDTH == 16) begin : gen_fmt_16_bits
+    always_comb begin : NaN_box_inputs
+      if(divsqrt_fmt_q == 2'b00) begin // 16-bit
+        srcf0[63:16] = '1;
+        srcf1[63:16] = '1;
+        srcf0[15:0] = srcf0_q[15:0];
+        srcf1[15:0] = srcf1_q[15:0];
+      end else begin // Unsupported
+        srcf0[63:0] = '1;
+        srcf1[63:0] = '1;
+      end
+    end
+  end else begin
+    $fatal(1, "DivSqrt THMULTI: Unsupported WIDTH (the supported width are 64, 32, 16)");
+  end
+
+  assign div_op = (divsqrt_op_q == fpnew_pkg::DIV) ? 1'b1 : 1'b0;
+  assign sqrt_op = (divsqrt_op_q != fpnew_pkg::DIV) ? 1'b1 : 1'b0;
+  
+  // Select func 1 cycle after div issue
+  logic func_sel;
+  `FFLARNC(func_sel, 1'b1, op_starting, func_sel, 1'b0, clk_i, rst_ni)
+
+  // Select operands 2 cycles after div issue
+  logic op_sel;
+  `FFLARNC(op_sel, 1'b1, func_sel, op_sel, 1'b0, clk_i, rst_ni)
+
+  ct_vfdsu_top i_ct_vfdsu_top (
+    .cp0_vfpu_icg_en                ( 1'b0                      ), // Internal clock gating, (module enable) doesn't matter when the clk_gate module is redundant anyway
+    .cp0_yy_clk_en                  ( 1'b1                      ), // Global clock enable (same as above)
+    .cpurst_b                       ( rst_ni                    ), // Reset
+    .dp_vfdsu_ex1_pipex_dst_ereg    ( '0                        ), // Don't care, used in C910
+    .dp_vfdsu_ex1_pipex_dst_vreg    ( '0                        ), // Don't care, used in C910
+    .dp_vfdsu_ex1_pipex_iid         ( '0                        ), // Don't care, used in C910
+    .dp_vfdsu_ex1_pipex_imm0        ( 3'b111                    ), // Round mode, set to 3'b111 to select vfpu_yy_xx_rm signal
+    .dp_vfdsu_ex1_pipex_sel         ( op_sel                    ), // 3. Select operands, start operation
+    .dp_vfdsu_ex1_pipex_srcf0       ( srcf0                     ), // Input for operand 0
+    .dp_vfdsu_ex1_pipex_srcf1       ( srcf1                     ), // Input for operand 1
+    .dp_vfdsu_fdiv_gateclk_issue    ( 1'b1                      ), // Local clock enable (same as above)
+    .dp_vfdsu_idu_fdiv_issue        ( op_starting               ), // 1. Issue fdiv (FSM in ctrl)
+    .forever_cpuclk                 ( clk_i                     ), // Clock input
+    .idu_vfpu_rf_pipex_func         ( {3'b0, divsqrt_fmt_q, 13'b0 ,sqrt_op, div_op} ), // Defines format (bits 16,15) and operation (bits 1,0)
+    .idu_vfpu_rf_pipex_gateclk_sel  ( func_sel                  ), // 2. Select func
+    .pad_yy_icg_scan_en             ( 1'b0                      ), // SE signal for the redundant clock gating module
+    .rtu_yy_xx_flush                ( flush_i                   ), // Flush
+    .vfpu_yy_xx_dqnan               ( 1'b0                      ), // Disable qNaN, set to 1 if sNaN is used
+    .vfpu_yy_xx_rm                  ( rm_q                      ), // Round mode. redundant if imm0 set to the same
+    .pipex_dp_vfdsu_ereg            (                           ), // Don't care, used by C910
+    .pipex_dp_vfdsu_ereg_data       ( unit_status               ), // Output: status flags
+    .pipex_dp_vfdsu_freg_data       ( unit_result               ), // Output: result
+    .pipex_dp_vfdsu_inst_vld        ( unit_done                 ), // The result is valid
+    .pipex_dp_vfdsu_vreg            (                           ), // Don't care, used by C910
+    .vfdsu_dp_fdiv_busy             ( vfdsu_dp_fdiv_busy        ), // Unit is busy, data in flight
+    .vfdsu_dp_inst_wb_req           (                           ), // Don't care, used by C910
+    .vfdsu_ifu_debug_ex2_wait       (                           ), // Debug output
+    .vfdsu_ifu_debug_idle           (                           ), // Debug output
+    .vfdsu_ifu_debug_pipe_busy      (                           )  // Debug output
+  );
+
+  assign unit_ready = !vfdsu_dp_fdiv_busy;
+
+  // Hold the result when one lane has finished execution, except when all the lanes finish together,
+  // or the operation is not vectorial, and the result can be accepted downstream
+  assign hold_en = unit_done & (~simd_synch_done_i | ~out_ready) & ~(~result_vec_op_q & out_ready);
+  // The Hold register (load, no reset)
+  `FFLNR(held_result_q, unit_result, hold_en, clk_i)
+  `FFLNR(held_status_q, unit_status, hold_en, clk_i)
+
+  // --------------
+  // Output Select
+  // --------------
+  logic [WIDTH-1:0]   result_d;
+  fpnew_pkg::status_t status_d;
+  // Prioritize hold register data
+  assign result_d[WIDTH-1:0] = unit_done_q ? held_result_q[WIDTH-1:0] : unit_result[WIDTH-1:0];
+  assign status_d = unit_done_q ? held_status_q : unit_status;
+
+  // ----------------
+  // Output Pipeline
+  // ----------------
+  // Output pipeline signals, index i holds signal after i register stages
+  logic               [0:NUM_OUT_REGS][WIDTH-1:0] out_pipe_result_q;
+  fpnew_pkg::status_t [0:NUM_OUT_REGS]            out_pipe_status_q;
+  TagType             [0:NUM_OUT_REGS]            out_pipe_tag_q;
+  logic               [0:NUM_OUT_REGS]            out_pipe_mask_q;
+  AuxType             [0:NUM_OUT_REGS]            out_pipe_aux_q;
+  logic               [0:NUM_OUT_REGS]            out_pipe_valid_q;
+  // Ready signal is combinatorial for all stages
+  logic [0:NUM_OUT_REGS] out_pipe_ready;
+
+  // Input stage: First element of pipeline is taken from inputs
+  assign out_pipe_result_q[0] = result_d;
+  assign out_pipe_status_q[0] = status_d;
+  assign out_pipe_tag_q[0]    = result_tag_q;
+  assign out_pipe_mask_q[0]   = result_mask_q;
+  assign out_pipe_aux_q[0]    = result_aux_q;
+  assign out_pipe_valid_q[0]  = out_valid;
+  // Input stage: Propagate pipeline ready signal to inside pipe
+  assign out_ready = out_pipe_ready[0];
+  // Generate the register stages
+  for (genvar i = 0; i < NUM_OUT_REGS; i++) begin : gen_output_pipeline
+    // Load enable of stage i: high when a valid item moves from index i to index i+1
+    logic reg_ena;
+    // Determine the ready signal of the current stage - advance the pipeline:
+    // 1. if the next stage is ready for our data
+    // 2. if the next stage only holds a bubble (not valid) -> we can pop it
+    assign out_pipe_ready[i] = out_pipe_ready[i+1] | ~out_pipe_valid_q[i+1];
+    // Valid: enabled by ready signal, synchronous clear with the flush signal
+    `FFLARNC(out_pipe_valid_q[i+1], out_pipe_valid_q[i], out_pipe_ready[i], flush_i, 1'b0, clk_i, rst_ni)
+    // Enable register if pipeline ready and a valid data item is present
+    assign reg_ena = out_pipe_ready[i] & out_pipe_valid_q[i];
+    // Payload registers of this stage: they only load on reg_ena
+    `FFL(out_pipe_result_q[i+1], out_pipe_result_q[i], reg_ena, '0)
+    `FFL(out_pipe_status_q[i+1], out_pipe_status_q[i], reg_ena, '0)
+    `FFL(out_pipe_tag_q[i+1],    out_pipe_tag_q[i],    reg_ena, TagType'('0))
+    `FFL(out_pipe_mask_q[i+1],   out_pipe_mask_q[i],   reg_ena, '0)
+    `FFL(out_pipe_aux_q[i+1],    out_pipe_aux_q[i],    reg_ena, AuxType'('0))
+  end
+  // Output stage: Ready travels backwards from output side, driven by downstream circuitry
+  assign out_pipe_ready[NUM_OUT_REGS] = out_ready_i;
+  // Output stage: assign module outputs
+  assign result_o        = out_pipe_result_q[NUM_OUT_REGS];
+  assign status_o        = out_pipe_status_q[NUM_OUT_REGS];
+  assign extension_bit_o = 1'b1; // always NaN-Box result
+  assign tag_o           = out_pipe_tag_q[NUM_OUT_REGS];
+  assign mask_o          = out_pipe_mask_q[NUM_OUT_REGS];
+  assign aux_o           = out_pipe_aux_q[NUM_OUT_REGS];
+  assign out_valid_o     = out_pipe_valid_q[NUM_OUT_REGS];
+  assign busy_o          = (| {inp_pipe_valid_q, unit_busy, out_pipe_valid_q});
+endmodule
+
diff --git a/src/fpnew_opgroup_block.sv b/src/fpnew_opgroup_block.sv
index eb3f529e..db2c3032 100644
--- a/src/fpnew_opgroup_block.sv
+++ b/src/fpnew_opgroup_block.sv
@@ -18,7 +18,7 @@ module fpnew_opgroup_block #(
   // FPU configuration
   parameter int unsigned                Width         = 32,
   parameter logic                       EnableVectors = 1'b1,
-  parameter logic                       PulpDivsqrt   = 1'b1,
+  parameter fpnew_pkg::divsqrt_unit_t   DivSqrtSel    = fpnew_pkg::THMULTI,
   parameter fpnew_pkg::fmt_logic_t      FpFmtMask     = '1,
   parameter fpnew_pkg::ifmt_logic_t     IntFmtMask    = '1,
   parameter fpnew_pkg::fmt_unsigned_t   FmtPipeRegs   = '{default: 0},
@@ -183,7 +183,7 @@ module fpnew_opgroup_block #(
       .FpFmtConfig   ( FpFmtMask        ),
       .IntFmtConfig  ( IntFmtMask       ),
       .EnableVectors ( EnableVectors    ),
-      .PulpDivsqrt   ( PulpDivsqrt      ),
+      .DivSqrtSel    ( DivSqrtSel       ),
       .NumPipeRegs   ( REG              ),
       .PipeConfig    ( PipeConfig       ),
       .TagType       ( TagType          ),
diff --git a/src/fpnew_opgroup_multifmt_slice.sv b/src/fpnew_opgroup_multifmt_slice.sv
index d135141d..6b5545c5 100644
--- a/src/fpnew_opgroup_multifmt_slice.sv
+++ b/src/fpnew_opgroup_multifmt_slice.sv
@@ -16,17 +16,17 @@
 `include "common_cells/registers.svh"
 
 module fpnew_opgroup_multifmt_slice #(
-  parameter fpnew_pkg::opgroup_e     OpGroup       = fpnew_pkg::CONV,
-  parameter int unsigned             Width         = 64,
+  parameter fpnew_pkg::opgroup_e      OpGroup       = fpnew_pkg::CONV,
+  parameter int unsigned              Width         = 64,
   // FPU configuration
-  parameter fpnew_pkg::fmt_logic_t   FpFmtConfig   = '1,
-  parameter fpnew_pkg::ifmt_logic_t  IntFmtConfig  = '1,
-  parameter logic                    EnableVectors = 1'b1,
-  parameter logic                    PulpDivsqrt   = 1'b1,
-  parameter int unsigned             NumPipeRegs   = 0,
-  parameter fpnew_pkg::pipe_config_t PipeConfig    = fpnew_pkg::BEFORE,
-  parameter type                     TagType       = logic,
-  parameter fpnew_pkg::rsr_impl_t    StochasticRndImplementation = fpnew_pkg::DEFAULT_NO_RSR,
+  parameter fpnew_pkg::fmt_logic_t    FpFmtConfig   = '1,
+  parameter fpnew_pkg::ifmt_logic_t   IntFmtConfig  = '1,
+  parameter logic                     EnableVectors = 1'b1,
+  parameter fpnew_pkg::divsqrt_unit_t DivSqrtSel    = fpnew_pkg::THMULTI,
+  parameter int unsigned              NumPipeRegs   = 0,
+  parameter fpnew_pkg::pipe_config_t  PipeConfig    = fpnew_pkg::BEFORE,
+  parameter type                      TagType       = logic,
+  parameter fpnew_pkg::rsr_impl_t     StochasticRndImplementation = fpnew_pkg::DEFAULT_NO_RSR,
   // Do not change
   localparam int unsigned NUM_OPERANDS = fpnew_pkg::num_operands(OpGroup),
   localparam int unsigned NUM_FORMATS  = fpnew_pkg::NUM_FP_FORMATS,
@@ -64,11 +64,14 @@ module fpnew_opgroup_multifmt_slice #(
   output logic                                    busy_o
 );
 
-  if ((OpGroup == fpnew_pkg::DIVSQRT) && !PulpDivsqrt &&
-      !((FpFmtConfig[0] == 1) && (FpFmtConfig[1:NUM_FORMATS-1] == '0))) begin
-    $fatal(1, "T-Head-based DivSqrt unit supported only in FP32-only configurations. \
-Set PulpDivsqrt to 1 not to use the PULP DivSqrt unit \
-or set Features.FpFmtMask to support only FP32");
+  if ((OpGroup == fpnew_pkg::DIVSQRT)) begin
+    if ((DivSqrtSel == fpnew_pkg::TH32) && !((FpFmtConfig[0] == 1) && (FpFmtConfig[1:NUM_FORMATS-1] == '0))) begin
+      $fatal(1, "T-Head-based DivSqrt unit supported only in FP32-only configurations. \
+Set DivSqrtSel = THMULTI or DivSqrtSel = PULP to use a multi-format divider");
+    end else if ((DivSqrtSel == fpnew_pkg::THMULTI) && (FpFmtConfig[3] == 1'b1 || FpFmtConfig[4] == 1'b1 || FpFmtConfig[5] == 1'b1)) begin
+      $warning("The DivSqrt unit of C910 (instantiated by DivSqrtSel = THMULTI) does not support \
+FP16alt, FP8, FP8alt. Please use the PULP DivSqrt unit when in need of div/sqrt operations on FP16alt, FP8, FP8alt.");
+    end
   end
 
   if ((OpGroup == fpnew_pkg::DOTP) &&
@@ -82,6 +85,7 @@ or on 16b inputs producing 32b outputs");
   localparam int unsigned MAX_FP_WIDTH   = fpnew_pkg::max_fp_width(FpFmtConfig);
   localparam int unsigned MAX_INT_WIDTH  = fpnew_pkg::max_int_width(IntFmtConfig);
   localparam int unsigned NUM_LANES = fpnew_pkg::max_num_lanes(Width, FpFmtConfig, 1'b1);
+  localparam int unsigned NUM_DIVSQRT_LANES = fpnew_pkg::num_divsqrt_lanes(Width, FpFmtConfig, 1'b1, DivSqrtSel);
   localparam int unsigned NUM_DOTP_LANES = fpnew_pkg::num_dotp_lanes(Width, FpFmtConfig);
   localparam int unsigned NUM_INT_FORMATS = fpnew_pkg::NUM_INT_FORMATS;
   // We will send the format information along with the data
@@ -201,7 +205,8 @@ or on 16b inputs producing 32b outputs");
     logic [LANE_WIDTH-1:0] local_result; // lane-local results
 
     // Generate instances only if needed, lane 0 always generated
-    if ((lane == 0) || (EnableVectors & !(OpGroup == fpnew_pkg::DOTP && (lane >= NUM_DOTP_LANES)))) begin : active_lane
+    if ((lane == 0) || (EnableVectors & (!(OpGroup == fpnew_pkg::DOTP && (lane >= NUM_DOTP_LANES))
+                                        && !(OpGroup == fpnew_pkg::DIVSQRT && (lane >= NUM_DIVSQRT_LANES))))) begin : active_lane
       logic in_valid, out_valid, out_ready; // lane-local handshake
 
       logic [NUM_OPERANDS-1:0][LANE_WIDTH-1:0] local_operands;  // lane-local oprands
@@ -317,7 +322,7 @@ or on 16b inputs producing 32b outputs");
           .busy_o          ( lane_busy[lane]     )
         );
       end else if (OpGroup == fpnew_pkg::DIVSQRT) begin : lane_instance
-        if (!PulpDivsqrt) begin : gen_th_32_divsqrt
+         if (DivSqrtSel == fpnew_pkg::TH32 && LANE_FORMATS[0] && (LANE_FORMATS[1:fpnew_pkg::NUM_FP_FORMATS-1] == '0)) begin : gen_th32_e906_divsqrt
           // The T-head-based DivSqrt unit is supported only in FP32-only configurations
           fpnew_divsqrt_th_32 #(
             .NumPipeRegs ( NumPipeRegs          ),
@@ -347,6 +352,42 @@ or on 16b inputs producing 32b outputs");
             .out_ready_i     ( out_ready           ),
             .busy_o          ( lane_busy[lane]     )
           );
+        end else if(DivSqrtSel == fpnew_pkg::THMULTI) begin : gen_thmulti_c910_divsqrt
+          fpnew_divsqrt_th_64_multi #(
+            .FpFmtConfig ( LANE_FORMATS         ),
+            .NumPipeRegs ( NumPipeRegs          ),
+            .PipeConfig  ( PipeConfig           ),
+            .TagType     ( TagType              ),
+            .AuxType     ( logic [AUX_BITS-1:0] )
+          ) i_fpnew_divsqrt_th_64_c910 (
+           .clk_i,
+            .rst_ni,
+            .operands_i       ( local_operands[1:0] ), // 2 operands
+            .is_boxed_i       ( is_boxed_2op        ), // 2 operands
+            .rnd_mode_i       ( rnd_mode            ),
+            .op_i,
+            .dst_fmt_i,
+            .tag_i,
+            .mask_i           ( simd_mask_i[lane]   ),
+            .aux_i            ( aux_data            ),
+            .vectorial_op_i   ( vectorial_op        ), // synchronize only vectorial operations
+            .in_valid_i       ( in_valid            ),
+            .in_ready_o       ( lane_in_ready[lane] ),
+            .divsqrt_done_o   ( divsqrt_done[lane]  ),
+            .simd_synch_done_i( simd_synch_done     ),
+            .divsqrt_ready_o  ( divsqrt_ready[lane] ),
+            .simd_synch_rdy_i ( simd_synch_rdy      ),
+            .flush_i,
+            .result_o         ( op_result           ),
+            .status_o         ( op_status           ),
+            .extension_bit_o  ( lane_ext_bit[lane]  ),
+            .tag_o            ( lane_tags[lane]     ),
+            .mask_o           ( lane_masks[lane]    ),
+            .aux_o            ( lane_aux[lane]      ),
+            .out_valid_o      ( out_valid           ),
+            .out_ready_i      ( out_ready           ),
+            .busy_o           ( lane_busy[lane]     )
+          );
         end else begin : gen_pulp_divsqrt
           fpnew_divsqrt_multi #(
             .FpFmtConfig ( LANE_FORMATS         ),
@@ -585,7 +626,7 @@ or on 16b inputs producing 32b outputs");
     assign conv_target_q = '0;
   end
 
-  if (PulpDivsqrt && (OpGroup == fpnew_pkg::DIVSQRT)) begin
+  if ((DivSqrtSel != fpnew_pkg::TH32) && (OpGroup == fpnew_pkg::DIVSQRT)) begin
     // Synch lanes if there is more than one
     assign simd_synch_rdy  = EnableVectors ? &divsqrt_ready : divsqrt_ready[0];
     assign simd_synch_done = EnableVectors ? &divsqrt_done  : divsqrt_done[0];
diff --git a/src/fpnew_pkg.sv b/src/fpnew_pkg.sv
index 0fc88d68..42d0df6b 100644
--- a/src/fpnew_pkg.sv
+++ b/src/fpnew_pkg.sv
@@ -130,6 +130,15 @@ package fpnew_pkg;
     SDOTP, EXVSUM, VSUM          // DOTP operation group
   } operation_e;
 
+  // -------------
+  // DIVSQRT UNIT
+  // -------------
+  typedef enum logic[1:0] {
+    PULP,    // "PULP" instantiates the PULP DivSqrt unit, which supports FP64, FP32, FP16, FP16ALT, FP8 and SIMD operations
+    TH32,    // "TH32" instantiates the E906 DivSqrt unit, which supports only FP32 (no SIMD support)
+    THMULTI  // "THMULTI" instantiates the C910 DivSqrt unit, which supports FP64, FP32, FP16 and SIMD operations
+  } divsqrt_unit_t;
+
   // -------------------
   // RISC-V FP-SPECIFIC
   // -------------------
@@ -442,6 +451,13 @@ package fpnew_pkg;
     return vec ? width / min_fp_width(cfg) : 1; // if no vectors, only one lane
   endfunction
 
+  // Returns the number of DivSqrt lanes according to width, format config, vectors and the selected DivSqrt unit
+  function automatic int unsigned num_divsqrt_lanes(int unsigned width, fmt_logic_t cfg, logic vec, divsqrt_unit_t DivSqrtSel);
+    automatic fmt_logic_t cfg_tmp;
+    cfg_tmp = (DivSqrtSel == THMULTI) ? cfg & 6'b111000 : cfg; // THMULTI: only formats kept by the mask count (presumably FP32/FP64/FP16 - confirm against fp_format_e)
+    return vec ? width / min_fp_width(cfg_tmp) : 1; // if no vectors, only one lane
+  endfunction
+
   // Returns a mask of active FP formats that are present in lane lane_no of a multiformat slice
   function automatic fmt_logic_t get_lane_formats(int unsigned width,
                                                   fmt_logic_t cfg,
diff --git a/src/fpnew_top.sv b/src/fpnew_top.sv
index 9cea0ec1..b564286d 100644
--- a/src/fpnew_top.sv
+++ b/src/fpnew_top.sv
@@ -17,8 +17,8 @@ module fpnew_top #(
   // FPU configuration
   parameter fpnew_pkg::fpu_features_t       Features       = fpnew_pkg::RV64D_Xsflt,
   parameter fpnew_pkg::fpu_implementation_t Implementation = fpnew_pkg::DEFAULT_NOREGS,
-  // PulpDivSqrt = 0 enables T-head-based DivSqrt unit. Supported only for FP32-only instances of Fpnew
-  parameter logic                           PulpDivsqrt    = 1'b1,
+  // DivSqrtSel chooses among PULP, TH32, or THMULTI (see documentation and fpnew_pkg.sv for further details)
+  parameter fpnew_pkg::divsqrt_unit_t       DivSqrtSel     = fpnew_pkg::THMULTI,
   parameter type                            TagType        = logic,
   parameter logic                           TrueSIMDClass  = 1'b0,
   parameter logic                           EnableSIMDMask = 1'b0,
@@ -122,7 +122,7 @@ module fpnew_top #(
       .OpGroup       ( fpnew_pkg::opgroup_e'(opgrp)    ),
       .Width         ( WIDTH                           ),
       .EnableVectors ( Features.EnableVectors          ),
-      .PulpDivsqrt   ( PulpDivsqrt                     ),
+      .DivSqrtSel    ( DivSqrtSel                      ),
       .FpFmtMask     ( Features.FpFmtMask              ),
       .IntFmtMask    ( Features.IntFmtMask             ),
       .FmtPipeRegs   ( Implementation.PipeRegs[opgrp]  ),
diff --git a/src_files.yml b/src_files.yml
index eaf51dd0..84348a98 100644
--- a/src_files.yml
+++ b/src_files.yml
@@ -21,7 +21,20 @@ fpnew:
     vendor/opene906/E906_RTL_FACTORY/gen_rtl/fpu/rtl/pa_fpu_dp.v,
     vendor/opene906/E906_RTL_FACTORY/gen_rtl/fpu/rtl/pa_fpu_frbus.v,
     vendor/opene906/E906_RTL_FACTORY/gen_rtl/fpu/rtl/pa_fpu_src_type.v,
+#   vendor/openc910/C910_RTL_FACTORY/gen_rtl/clk/rtl/gated_clk_cell.v, # same as the one from E906
+    vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v,
+    vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_double.v,
+    vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ff1.v,
+    vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v,
+    vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v,
+    vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v,
+    vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v,
+    vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt_radix16_bound_table.v,
+    vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt_radix16_with_sqrt.v,
+    vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v,
+    vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v,
     src/fpnew_divsqrt_th_32.sv,
+    src/fpnew_divsqrt_th_64_multi.sv,
     src/fpnew_divsqrt_multi.sv,
     src/fpnew_fma.sv,
     src/fpnew_fma_multi.sv,
@@ -32,5 +45,6 @@ fpnew:
     src/fpnew_opgroup_fmt_slice.sv,
     src/fpnew_opgroup_multifmt_slice.sv,
     src/fpnew_rounding.sv,
+    src/lfsr_sr.sv,
     src/fpnew_top.sv,
   ]
diff --git a/vendor/openc910.lock.hjson b/vendor/openc910.lock.hjson
new file mode 100644
index 00000000..64cdb3e8
--- /dev/null
+++ b/vendor/openc910.lock.hjson
@@ -0,0 +1,14 @@
+// Copyright lowRISC contributors.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// This file is generated by the util/vendor script. Please do not modify it
+// manually.
+
+{
+  upstream:
+  {
+    url: https://github.com/T-head-Semi/openc910
+    rev: e0c4ad8ec7f8c70f649d826ebd6c949086453272
+  }
+}
diff --git a/vendor/openc910.vendor.hjson b/vendor/openc910.vendor.hjson
new file mode 100644
index 00000000..ddaa644f
--- /dev/null
+++ b/vendor/openc910.vendor.hjson
@@ -0,0 +1,47 @@
+// Copyright 2023 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+{
+  name: "openc910",
+  target_dir: "openc910"
+
+  upstream: {
+    url: "https://github.com/T-head-Semi/openc910"
+    rev: "e0c4ad8ec7f8c70f649d826ebd6c949086453272"
+  }
+
+  exclude_from_upstream: [
+    "doc",
+    "smart_run",
+    "C910_RTL_FACTORY/gen_rtl/biu",
+    "C910_RTL_FACTORY/gen_rtl/biu/rtl",
+    "C910_RTL_FACTORY/gen_rtl/ciu",
+    "C910_RTL_FACTORY/gen_rtl/clint",
+    "C910_RTL_FACTORY/gen_rtl/clk/rtl/ct_mp_clk_top.v",
+    "C910_RTL_FACTORY/gen_rtl/clk/rtl/ct_clk_top.v",
+    "C910_RTL_FACTORY/gen_rtl/common",
+    "C910_RTL_FACTORY/gen_rtl/cp0",
+    "C910_RTL_FACTORY/gen_rtl/cpu",
+    "C910_RTL_FACTORY/gen_rtl/filelists",
+    "C910_RTL_FACTORY/gen_rtl/fpga",
+    "C910_RTL_FACTORY/gen_rtl/had",
+    "C910_RTL_FACTORY/gen_rtl/idu",
+    "C910_RTL_FACTORY/gen_rtl/ifu",
+    "C910_RTL_FACTORY/gen_rtl/iu",
+    "C910_RTL_FACTORY/gen_rtl/l2c",
+    "C910_RTL_FACTORY/gen_rtl/lsu",
+    "C910_RTL_FACTORY/gen_rtl/mmu",
+    "C910_RTL_FACTORY/gen_rtl/plic",
+    "C910_RTL_FACTORY/gen_rtl/pmp",
+    "C910_RTL_FACTORY/gen_rtl/pmu",
+    "C910_RTL_FACTORY/gen_rtl/rst",
+    "C910_RTL_FACTORY/gen_rtl/rtu",
+    "C910_RTL_FACTORY/gen_rtl/vfalu",
+    "C910_RTL_FACTORY/gen_rtl/vfmau",
+    "C910_RTL_FACTORY/gen_rtl/vfpu",
+    "C910_RTL_FACTORY/gen_rtl/vfpu/rtl",
+    "C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt_radix16_only_div.v",
+    "C910_RTL_FACTORY/setup"
+  ]
+
+}
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/clk/rtl/gated_clk_cell.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/clk/rtl/gated_clk_cell.v
new file mode 100644
index 00000000..c7d58ad7
--- /dev/null
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/clk/rtl/gated_clk_cell.v
@@ -0,0 +1,49 @@
+/*Copyright 2019-2021 T-Head Semiconductor Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+module gated_clk_cell( // clock-gating cell interface; in this version clk_out is simply clk_in
+  clk_in,
+  global_en,
+  module_en,
+  local_en,
+  external_en,
+  pad_yy_icg_scan_en,
+  clk_out
+);
+
+input  clk_in;
+input  global_en;
+input  module_en;
+input  local_en;
+input  external_en;
+input  pad_yy_icg_scan_en;
+output clk_out;
+
+wire   clk_en_bf_latch;
+wire   SE;
+
+assign clk_en_bf_latch = (global_en && (module_en || local_en)) || external_en ; // combined enable; not referenced again in this module
+
+// SE driven from primary input, held constant
+assign SE	       = pad_yy_icg_scan_en; // scan enable copy; not referenced again in this module
+
+// //   &Connect(    .clk_in           (clk_in), @50
+// //                .SE               (SE), @51
+// //                .external_en      (clk_en_bf_latch), @52
+// //                .clk_out          (clk_out) @53
+// //                ) ; @54
+assign clk_out = clk_in; // no gating applied: the enable inputs do not affect clk_out
+
+endmodule   
\ No newline at end of file
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v
new file mode 100644
index 00000000..f7f541f2
--- /dev/null
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v
@@ -0,0 +1,520 @@
+/*Copyright 2019-2021 T-Head Semiconductor Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// &Depend("cpu_cfig.h"); @22
+// &ModuleBeg; @23
+module ct_vfdsu_ctrl(
+  cp0_vfpu_icg_en,
+  cp0_yy_clk_en,
+  cpurst_b,
+  dp_vfdsu_ex1_pipex_sel,
+  dp_vfdsu_fdiv_gateclk_issue,
+  dp_vfdsu_idu_fdiv_issue,
+  ex1_data_clk,
+  ex1_double,
+  ex1_pipedown,
+  ex1_single,
+  ex2_data_clk,
+  ex2_pipedown,
+  ex2_srt_first_round,
+  ex3_data_clk,
+  ex3_pipedown,
+  forever_cpuclk,
+  pad_yy_icg_scan_en,
+  pipex_dp_vfdsu_inst_vld,
+  rtu_yy_xx_flush,
+  srt_ctrl_rem_zero,
+  srt_ctrl_skip_srt,
+  srt_secd_round,
+  srt_sm_on,
+  vfdsu_dp_fdiv_busy,
+  vfdsu_dp_inst_wb_req,
+  vfdsu_ex2_double,
+  vfdsu_ex2_single,
+  vfdsu_ifu_debug_ex2_wait,
+  vfdsu_ifu_debug_idle,
+  vfdsu_ifu_debug_pipe_busy
+);
+
+// &Ports; @24
+input          cp0_vfpu_icg_en;            
+input          cp0_yy_clk_en;              
+input          cpurst_b;                   
+input          dp_vfdsu_ex1_pipex_sel;     
+input          dp_vfdsu_fdiv_gateclk_issue; 
+input          dp_vfdsu_idu_fdiv_issue;    
+input          ex1_double;                 
+input          ex1_single;                 
+input          forever_cpuclk;             
+input          pad_yy_icg_scan_en;         
+input          rtu_yy_xx_flush;            
+input          srt_ctrl_rem_zero;          
+input          srt_ctrl_skip_srt;          
+input          vfdsu_ex2_double;           
+input          vfdsu_ex2_single;           
+output         ex1_data_clk;               
+output         ex1_pipedown;               
+output         ex2_data_clk;               
+output         ex2_pipedown;               
+output         ex2_srt_first_round;        
+output         ex3_data_clk;               
+output         ex3_pipedown;               
+output         pipex_dp_vfdsu_inst_vld;    
+output         srt_secd_round;             
+output         srt_sm_on;                  
+output         vfdsu_dp_fdiv_busy;         
+output         vfdsu_dp_inst_wb_req;       
+output         vfdsu_ifu_debug_ex2_wait;   
+output         vfdsu_ifu_debug_idle;       
+output         vfdsu_ifu_debug_pipe_busy;  
+
+// &Regs; @25
+reg     [3:0]  div_cur_state;              
+reg     [3:0]  div_next_state;             
+reg            ex2_srt_first_round;        
+reg            ex2_srt_secd_round;         
+reg     [4:0]  srt_cnt;                    
+reg            srt_cur_state;              
+reg            srt_nxt_state;              
+reg            vfdsu_ex3_vld;              
+reg            vfdsu_ex4_vld;              
+
+// &Wires; @26
+wire           cp0_vfpu_icg_en;            
+wire           cp0_yy_clk_en;              
+wire           cpurst_b;                   
+wire           div_sm_clk;                 
+wire           div_sm_clk_en;              
+wire           div_st_ex2;                 
+wire           dp_vfdsu_ex1_pipex_sel;     
+wire           dp_vfdsu_fdiv_gateclk_issue; 
+wire           dp_vfdsu_idu_fdiv_issue;    
+wire           ex1_data_clk;               
+wire           ex1_data_clk_en;            
+wire           ex1_double;                 
+wire           ex1_pipedown;               
+wire           ex1_single;                 
+wire           ex2_data_clk;               
+wire           ex2_data_clk_en;            
+wire           ex2_pipe_clk;               
+wire           ex2_pipe_clk_en;            
+wire           ex2_pipedown;               
+wire           ex2_srt_secd_round_pre;     
+wire           ex3_data_clk;               
+wire           ex3_data_clk_en;            
+wire           ex3_pipe_clk;               
+wire           ex3_pipe_clk_en;            
+wire           ex3_pipedown;               
+wire           ex4_pipedown;               
+wire           forever_cpuclk;             
+wire           pad_yy_icg_scan_en;         
+wire           pipex_dp_vfdsu_inst_vld;    
+wire           rtu_yy_xx_flush;            
+wire           skip_srt;                   
+wire    [4:0]  srt_cnt_ini;                
+wire           srt_cnt_zero;               
+wire           srt_ctrl_rem_zero;          
+wire           srt_ctrl_skip_srt;          
+wire           srt_last_round;             
+wire           srt_secd_round;             
+wire           srt_secd_round_pre;         
+wire           srt_sm_clk;                 
+wire           srt_sm_clk_en;              
+wire           srt_sm_on;                  
+wire           vfdsu_dp_fdiv_busy;         
+wire           vfdsu_dp_inst_wb_req;       
+wire           vfdsu_ex2_double;           
+wire           vfdsu_ex2_single;           
+wire           vfdsu_ex2_vld;              
+wire           vfdsu_ifu_debug_ex2_wait;   
+wire           vfdsu_ifu_debug_idle;       
+wire           vfdsu_ifu_debug_pipe_busy;  
+
+
+//==========================================================
+//              EX1 Stage Control Signal
+//==========================================================
+
+//vfdsu ex1 pipedown signal
+assign ex1_pipedown       = dp_vfdsu_ex1_pipex_sel;
+// &Force("output","ex1_pipedown"); @34
+//==========================================================
+//              EX2 Stage Control Signal
+//==========================================================
+//state parameter
+parameter SRT_IDLE = 1'b0;
+parameter SRT_BUSY = 1'b1;
+
+//gate clk
+// &Instance("gated_clk_cell","x_srt_sm_clk"); @43
+gated_clk_cell  x_srt_sm_clk (
+  .clk_in             (forever_cpuclk    ),
+  .clk_out            (srt_sm_clk        ),
+  .external_en        (1'b0              ),
+  .global_en          (cp0_yy_clk_en     ),
+  .local_en           (srt_sm_clk_en     ),
+  .module_en          (cp0_vfpu_icg_en   ),
+  .pad_yy_icg_scan_en (pad_yy_icg_scan_en)
+);
+
+// &Connect( .clk_in         (forever_cpuclk), @44
+//           .clk_out        (srt_sm_clk),//Out Clock @45
+//           .external_en    (1'b0), @46
+//           .global_en      (cp0_yy_clk_en), @47
+//           .local_en       (srt_sm_clk_en),//Local Condition @48
+//           .module_en      (cp0_vfpu_icg_en) @49
+//         ); @50
+assign srt_sm_clk_en = srt_cur_state || 
+                       ex1_pipedown  || 
+                       rtu_yy_xx_flush;
+
+//SRT state machine: asynchronous active-low reset (cpurst_b); rtu_yy_xx_flush forces SRT_IDLE
+always @(posedge srt_sm_clk or negedge cpurst_b)
+begin
+  if(!cpurst_b)
+    srt_cur_state <= SRT_IDLE;
+  else if(rtu_yy_xx_flush)
+    srt_cur_state <= SRT_IDLE;
+  else
+    srt_cur_state <= srt_nxt_state;
+end
+
+// &CombBeg; @66
+always @( ex1_pipedown
+       or srt_last_round
+       or srt_cur_state)
+begin
+case(srt_cur_state)
+SRT_IDLE : if(ex1_pipedown) // leave IDLE when ex1_pipedown is asserted
+             srt_nxt_state = SRT_BUSY;
+           else
+             srt_nxt_state = SRT_IDLE;
+SRT_BUSY : if(srt_last_round) // return to IDLE on the last SRT round
+             srt_nxt_state = SRT_IDLE;
+           else
+             srt_nxt_state = SRT_BUSY;
+default  :   srt_nxt_state = SRT_IDLE;
+endcase
+// &CombEnd; @78
+end
+
+//srt sm state
+//assign srt_sm_idle = ~srt_cur_state;
+assign srt_sm_on   =  srt_cur_state;
+// &Force("output","srt_sm_on"); @83
+//state machine control signal
+//srt_last_round on three condition : 
+//  1.srt need not execute
+//  2.srt rem is zero 
+//  3.srt cnt zero
+assign srt_last_round = (skip_srt || 
+                         srt_ctrl_rem_zero || 
+                         srt_cnt_zero)      && 
+                         srt_sm_on;
+assign skip_srt       =  srt_ctrl_skip_srt;
+assign srt_cnt_zero   = ~|srt_cnt[4:0];
+//srt counter: cleared on reset/flush, loaded on ex1_pipedown, decremented while srt_sm_on
+always @(posedge srt_sm_clk or negedge cpurst_b)
+begin
+  if(!cpurst_b)
+    srt_cnt[4:0] <= 5'b0;
+  else if(rtu_yy_xx_flush)
+    srt_cnt[4:0] <= 5'b0;
+  else if(ex1_pipedown)
+    srt_cnt[4:0] <= srt_cnt_ini[4:0];
+  else if(srt_sm_on)
+    srt_cnt[4:0] <= srt_cnt[4:0] - 5'b1;
+  else
+    srt_cnt[4:0] <= srt_cnt[4:0];
+end
+
+//srt_cnt_ini[4:0]: start value of the srt counter
+//For Double, initial is 5'b01101('d13); for Single, initial is 5'b00110('d6)
+//Otherwise (neither ex1_double nor ex1_single), initial is 5'b00011('d3)
+assign srt_cnt_ini[4:0] = (ex1_double) ? 5'b01101 :
+                           ex1_single  ? 5'b00110
+                                       : 5'b00011;
+
+//vfdsu ex2 pipedown signal
+assign ex2_pipedown = srt_last_round && div_st_ex2;
+// &Force("output","ex2_pipedown"); @157
+// &Force("output","ex2_srt_first_round"); @172
+always @(posedge srt_sm_clk or negedge cpurst_b)
+begin
+  if(!cpurst_b)
+    ex2_srt_first_round <= 1'b0;
+  else if(rtu_yy_xx_flush)
+    ex2_srt_first_round <= 1'b0;
+  else if(ex1_pipedown)
+    ex2_srt_first_round <= 1'h1;
+  else
+    ex2_srt_first_round <= 1'b0;
+end
+// &Force("output","ex2_srt_first_round"); @195
+always @(posedge srt_sm_clk or negedge cpurst_b)
+begin
+  if(!cpurst_b)
+    ex2_srt_secd_round <= 1'b0;
+  else if(rtu_yy_xx_flush)
+    ex2_srt_secd_round <= 1'b0;
+  else
+    ex2_srt_secd_round <= {1{ex2_srt_secd_round_pre}};
+end
+assign srt_secd_round  = ex2_srt_secd_round;
+
+
+assign ex2_srt_secd_round_pre  = srt_sm_on && srt_secd_round_pre;
+assign srt_secd_round_pre      = vfdsu_ex2_double ? srt_cnt[4:0]==5'b01101 : 
+                                 vfdsu_ex2_single ? srt_cnt[4:0]==5'b00110 : srt_cnt[4:0] == 5'b00011;
+
+//==========================================================
+//              EX3 Stage Control Signal
+//==========================================================
+//gate clk
+// &Instance("gated_clk_cell","x_ex2_pipe_clk"); @217
+gated_clk_cell  x_ex2_pipe_clk (
+  .clk_in             (forever_cpuclk    ),
+  .clk_out            (ex2_pipe_clk      ),
+  .external_en        (1'b0              ),
+  .global_en          (cp0_yy_clk_en     ),
+  .local_en           (ex2_pipe_clk_en   ),
+  .module_en          (cp0_vfpu_icg_en   ),
+  .pad_yy_icg_scan_en (pad_yy_icg_scan_en)
+);
+
+// &Connect( .clk_in         (forever_cpuclk), @218
+//           .clk_out        (ex2_pipe_clk),//Out Clock @219
+//           .external_en    (1'b0), @220
+//           .global_en      (cp0_yy_clk_en), @221
+//           .local_en       (ex2_pipe_clk_en),//Local Condition @222
+//           .module_en      (cp0_vfpu_icg_en) @223
+//         ); @224
+assign ex2_pipe_clk_en = vfdsu_ex2_vld || 
+                         vfdsu_ex3_vld || 
+                         rtu_yy_xx_flush;
+assign vfdsu_ex2_vld = ex2_pipedown;
+//EX2 to EX3 pipedown
+always @(posedge ex2_pipe_clk or negedge cpurst_b)
+begin
+  if(!cpurst_b)
+    vfdsu_ex3_vld <= 1'b0;
+  else if(rtu_yy_xx_flush)
+    vfdsu_ex3_vld <= 1'b0;
+  else if(ex2_pipedown)
+    vfdsu_ex3_vld <= 1'b1;
+  else
+    vfdsu_ex3_vld <= 1'b0;
+end
+assign ex3_pipedown  = vfdsu_ex3_vld;
+// &Force("output","ex3_pipedown"); @242
+
+//==========================================================
+//              EX4 Stage Control Signal
+//==========================================================
+//gate clk
+// &Instance("gated_clk_cell","x_ex3_pipe_clk"); @248
+gated_clk_cell  x_ex3_pipe_clk (
+  .clk_in             (forever_cpuclk    ),
+  .clk_out            (ex3_pipe_clk      ),
+  .external_en        (1'b0              ),
+  .global_en          (cp0_yy_clk_en     ),
+  .local_en           (ex3_pipe_clk_en   ),
+  .module_en          (cp0_vfpu_icg_en   ),
+  .pad_yy_icg_scan_en (pad_yy_icg_scan_en)
+);
+
+// &Connect( .clk_in         (forever_cpuclk), @249
+//           .clk_out        (ex3_pipe_clk),//Out Clock @250
+//           .external_en    (1'b0), @251
+//           .global_en      (cp0_yy_clk_en), @252
+//           .local_en       (ex3_pipe_clk_en),//Local Condition @253
+//           .module_en      (cp0_vfpu_icg_en) @254
+//         ); @255
+assign ex3_pipe_clk_en = ex3_pipedown || 
+                         vfdsu_ex4_vld || 
+                         rtu_yy_xx_flush;
+
+//EX3 to EX4 pipedown
+always @(posedge ex3_pipe_clk or negedge cpurst_b)
+begin
+  if(!cpurst_b)
+    vfdsu_ex4_vld <= 1'b0;
+  else if(rtu_yy_xx_flush)
+    vfdsu_ex4_vld <= 1'b0;
+  else if(ex3_pipedown)
+    vfdsu_ex4_vld <= 1'b1;
+  else
+    vfdsu_ex4_vld <= 1'b0;
+end
+assign ex4_pipedown = vfdsu_ex4_vld;
+
+
+//Div Write Back State Machine
+parameter IDLE      = 4'b0000;
+parameter RF        = 4'b0100;
+parameter EX1       = 4'b0101;
+parameter EX2       = 4'b0110;
+parameter WB_REQ    = 4'b0111;
+parameter WB        = 4'b1000;
+
+//GateClk
+// &Instance("gated_clk_cell","x_div_sm_clk"); @284
+gated_clk_cell  x_div_sm_clk (
+  .clk_in             (forever_cpuclk    ),
+  .clk_out            (div_sm_clk        ),
+  .external_en        (1'b0              ),
+  .global_en          (cp0_yy_clk_en     ),
+  .local_en           (div_sm_clk_en     ),
+  .module_en          (cp0_vfpu_icg_en   ),
+  .pad_yy_icg_scan_en (pad_yy_icg_scan_en)
+);
+
+// &Connect( .clk_in         (forever_cpuclk), @285
+//           .clk_out        (div_sm_clk),//Out Clock @286
+//           .external_en    (1'b0), @287
+//           .global_en      (cp0_yy_clk_en), @288
+//           .local_en       (div_sm_clk_en),//Local Condition @289
+//           .module_en      (cp0_vfpu_icg_en) @290
+//         ); @291
+assign div_sm_clk_en = dp_vfdsu_fdiv_gateclk_issue || 
+                       !(div_cur_state[3:0] == IDLE);
+//State Trans: asynchronous active-low reset (cpurst_b); rtu_yy_xx_flush forces IDLE
+always @(posedge div_sm_clk or negedge cpurst_b)
+begin
+  if(!cpurst_b)
+    div_cur_state[3:0] <= IDLE;
+  else if(rtu_yy_xx_flush)
+    div_cur_state[3:0] <= IDLE;
+  else
+    div_cur_state[3:0] <= div_next_state[3:0];
+end
+// &CombBeg; @304
+always @( dp_vfdsu_idu_fdiv_issue
+       or dp_vfdsu_ex1_pipex_sel
+       or ex4_pipedown
+       or srt_last_round
+       or div_cur_state[3:0])
+begin
+  case(div_cur_state[3:0])
+  IDLE       : if(dp_vfdsu_idu_fdiv_issue)
+                 div_next_state[3:0] = RF;
+               else
+                 div_next_state[3:0] = IDLE;
+  RF         : div_next_state[3:0] = EX1; // RF always advances to EX1
+  EX1        : if(dp_vfdsu_ex1_pipex_sel) 
+                  div_next_state[3:0] = EX2;
+               else
+                 div_next_state[3:0] = IDLE;
+  EX2        : if(srt_last_round)
+                 div_next_state[3:0] = WB_REQ;
+               else 
+                 div_next_state[3:0] = EX2;
+  WB_REQ   :   if(ex4_pipedown)
+                 div_next_state[3:0] = WB;
+               else
+                 div_next_state[3:0] = WB_REQ;
+  WB         : if(dp_vfdsu_idu_fdiv_issue) // a new issue in WB goes straight to RF without passing IDLE
+                 div_next_state[3:0] = RF;
+               else
+                 div_next_state[3:0] = IDLE;
+  default    :   div_next_state[3:0] = IDLE;
+  endcase
+// &CombEnd; @329
+end
+//Control Signal
+assign div_st_ex2             = (div_cur_state[3:0] == EX2);
+
+//Div Rdy Signal
+//assign vfdsu_vfpu_gateclk_en   = div_cur_state[2] || div_cur_state[3] || 
+//                                 ex4_pipedown;
+
+
+//Active Data with VFPU
+//GateClk
+// &Instance("gated_clk_cell","x_ex1_data_clk"); @340
+gated_clk_cell  x_ex1_data_clk (
+  .clk_in             (forever_cpuclk    ),
+  .clk_out            (ex1_data_clk      ),
+  .external_en        (1'b0              ),
+  .global_en          (cp0_yy_clk_en     ),
+  .local_en           (ex1_data_clk_en   ),
+  .module_en          (cp0_vfpu_icg_en   ),
+  .pad_yy_icg_scan_en (pad_yy_icg_scan_en)
+);
+
+// &Connect( .clk_in         (forever_cpuclk), @341
+//           .clk_out        (ex1_data_clk),//Out Clock @342
+//           .external_en    (1'b0), @343
+//           .global_en      (cp0_yy_clk_en), @344
+//           .local_en       (ex1_data_clk_en),//Local Condition @345
+//           .module_en      (cp0_vfpu_icg_en) @346
+//         ); @347
+assign ex1_data_clk_en = ex1_pipedown; 
+
+// &Instance("gated_clk_cell","x_ex2_data_clk"); @350
+gated_clk_cell  x_ex2_data_clk (
+  .clk_in             (forever_cpuclk    ),
+  .clk_out            (ex2_data_clk      ),
+  .external_en        (1'b0              ),
+  .global_en          (cp0_yy_clk_en     ),
+  .local_en           (ex2_data_clk_en   ),
+  .module_en          (cp0_vfpu_icg_en   ),
+  .pad_yy_icg_scan_en (pad_yy_icg_scan_en)
+);
+
+// &Connect( .clk_in         (forever_cpuclk), @351
+//           .clk_out        (ex2_data_clk),//Out Clock @352
+//           .external_en    (1'b0), @353
+//           .global_en      (cp0_yy_clk_en), @354
+//           .local_en       (ex2_data_clk_en),//Local Condition @355
+//           .module_en      (cp0_vfpu_icg_en) @356
+//         ); @357
+assign ex2_data_clk_en = ex2_pipedown;
+
+// &Instance("gated_clk_cell","x_ex3_data_clk"); @360
+gated_clk_cell  x_ex3_data_clk (
+  .clk_in             (forever_cpuclk    ),
+  .clk_out            (ex3_data_clk      ),
+  .external_en        (1'b0              ),
+  .global_en          (cp0_yy_clk_en     ),
+  .local_en           (ex3_data_clk_en   ),
+  .module_en          (cp0_vfpu_icg_en   ),
+  .pad_yy_icg_scan_en (pad_yy_icg_scan_en)
+);
+
+// &Connect( .clk_in         (forever_cpuclk), @361
+//           .clk_out        (ex3_data_clk),//Out Clock @362
+//           .external_en    (1'b0), @363
+//           .global_en      (cp0_yy_clk_en), @364
+//           .local_en       (ex3_data_clk_en),//Local Condition @365
+//           .module_en      (cp0_vfpu_icg_en) @366
+//         ); @367
+assign ex3_data_clk_en = ex3_pipedown;
+
+assign pipex_dp_vfdsu_inst_vld           = div_cur_state[3:0] == WB;
+// this is used to apply for the write back port
+assign vfdsu_dp_inst_wb_req   = vfdsu_ex3_vld; 
+assign vfdsu_dp_fdiv_busy     = div_cur_state[2]; // bit 2 is set for RF, EX1, EX2 and WB_REQ (4'b01xx), clear for IDLE and WB
+
+//Debug infor
+assign vfdsu_ifu_debug_ex2_wait  = 1'b0;
+assign vfdsu_ifu_debug_idle      = (div_cur_state[3:0] == IDLE);
+assign vfdsu_ifu_debug_pipe_busy = 1'b0;
+
+
+// &ModuleEnd; @381
+endmodule
+
+
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_double.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_double.v
new file mode 100644
index 00000000..b57e289e
--- /dev/null
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_double.v
@@ -0,0 +1,370 @@
+/*Copyright 2019-2021 T-Head Semiconductor Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// &Depend("cpu_cfig.h"); @22
+// &ModuleBeg; @23
+module ct_vfdsu_double(  // structural wrapper: instantiates and interconnects ct_vfdsu_prepare, ct_vfdsu_srt, ct_vfdsu_round and ct_vfdsu_pack
+  cp0_vfpu_icg_en,
+  cp0_yy_clk_en,
+  cpurst_b,
+  ex1_div,
+  ex1_double,
+  ex1_pipedown,
+  ex1_scalar,
+  ex1_single,
+  ex1_sqrt,
+  ex1_src0,
+  ex1_src1,
+  ex1_static_rm,
+  ex2_pipedown,
+  ex2_srt_first_round,
+  ex3_pipedown,
+  ex4_out_expt,
+  ex4_out_result,
+  forever_cpuclk,
+  pad_yy_icg_scan_en,
+  srt_ctrl_rem_zero,
+  srt_ctrl_skip_srt,
+  srt_secd_round,
+  srt_sm_on,
+  vfpu_yy_xx_dqnan,
+  vfpu_yy_xx_rm
+);
+// This module holds no assign/always logic: every port and wire below is only routed to the four sub-module instances.
+// &Ports; @24
+input           cp0_vfpu_icg_en;                      
+input           cp0_yy_clk_en;                        
+input           cpurst_b;                             
+input           ex1_div;                              
+input           ex1_double;                           
+input           ex1_pipedown;                         
+input           ex1_scalar;                           
+input           ex1_single;                           
+input           ex1_sqrt;                             
+input   [63:0]  ex1_src0;                             
+input   [63:0]  ex1_src1;                             
+input   [2 :0]  ex1_static_rm;                        
+input           ex2_pipedown;                         
+input           ex2_srt_first_round;                  
+input           ex3_pipedown;                         
+input           forever_cpuclk;                       
+input           pad_yy_icg_scan_en;                   
+input           srt_secd_round;                       
+input           srt_sm_on;                            
+input           vfpu_yy_xx_dqnan;                     
+input   [2 :0]  vfpu_yy_xx_rm;                        
+output  [4 :0]  ex4_out_expt;                         
+output  [63:0]  ex4_out_result;                       
+output          srt_ctrl_rem_zero;                    
+output          srt_ctrl_skip_srt;                    
+
+// &Regs; @25
+
+// &Wires; @26
+wire            cp0_vfpu_icg_en;                      
+wire            cp0_yy_clk_en;                        
+wire            cpurst_b;                             
+wire            ex1_div;                              
+wire    [52:0]  ex1_divisor;                          // internal net: connected to the prepare and srt instances only
+wire            ex1_double;                           
+wire            ex1_pipedown;                         
+wire    [59:0]  ex1_remainder;                        // internal net: connected to the prepare and srt instances only
+wire            ex1_scalar;                           
+wire            ex1_single;                           
+wire            ex1_sqrt;                             
+wire    [63:0]  ex1_src0;                             
+wire    [63:0]  ex1_src1;                             
+wire    [2 :0]  ex1_static_rm;                        
+wire            ex2_pipedown;                         
+wire            ex2_srt_first_round;                  
+wire            ex3_pipedown;                         
+wire    [4 :0]  ex4_out_expt;                         
+wire    [63:0]  ex4_out_result;                       
+wire            forever_cpuclk;                       
+wire            pad_yy_icg_scan_en;                   
+wire            srt_ctrl_rem_zero;                    
+wire            srt_ctrl_skip_srt;                    
+wire            srt_secd_round;                       
+wire            srt_sm_on;                            
+wire    [57:0]  total_qt_rt_58;                       // internal net: connected to the srt and round instances only
+wire            vfdsu_ex2_div;                        
+wire            vfdsu_ex2_double;                     
+wire            vfdsu_ex2_dz;                         
+wire    [12:0]  vfdsu_ex2_expnt_add0;                 
+wire    [12:0]  vfdsu_ex2_expnt_add1;                 
+wire            vfdsu_ex2_nv;                         
+wire            vfdsu_ex2_of_rm_lfn;                  
+wire            vfdsu_ex2_op0_norm;                   
+wire            vfdsu_ex2_op1_norm;                   
+wire    [51:0]  vfdsu_ex2_qnan_f;                     
+wire            vfdsu_ex2_qnan_sign;                  
+wire            vfdsu_ex2_result_inf;                 
+wire            vfdsu_ex2_result_qnan;                
+wire            vfdsu_ex2_result_sign;                
+wire            vfdsu_ex2_result_zero;                
+wire    [2 :0]  vfdsu_ex2_rm;                         
+wire            vfdsu_ex2_single;                     
+wire            vfdsu_ex2_sqrt;                       
+wire            vfdsu_ex2_srt_skip;                   
+wire    [12:0]  vfdsu_ex3_doub_expnt_rst;             
+wire            vfdsu_ex3_double;                     
+wire            vfdsu_ex3_dz;                         
+wire    [12:0]  vfdsu_ex3_half_expnt_rst;             
+wire            vfdsu_ex3_id_srt_skip;                
+wire            vfdsu_ex3_nv;                         
+wire            vfdsu_ex3_of;                         
+wire            vfdsu_ex3_potnt_of;                   
+wire            vfdsu_ex3_potnt_uf;                   
+wire    [51:0]  vfdsu_ex3_qnan_f;                     
+wire            vfdsu_ex3_qnan_sign;                  
+wire            vfdsu_ex3_rem_sign;                   
+wire            vfdsu_ex3_rem_zero;                   
+wire    [52:0]  vfdsu_ex3_result_denorm_round_add_num; 
+wire            vfdsu_ex3_result_inf;                 
+wire            vfdsu_ex3_result_lfn;                 
+wire            vfdsu_ex3_result_qnan;                
+wire            vfdsu_ex3_result_sign;                
+wire            vfdsu_ex3_result_zero;                
+wire    [2 :0]  vfdsu_ex3_rm;                         
+wire            vfdsu_ex3_rslt_denorm;                
+wire    [8 :0]  vfdsu_ex3_sing_expnt_rst;             
+wire            vfdsu_ex3_single;                     
+wire            vfdsu_ex3_uf;                         
+wire            vfdsu_ex4_denorm_to_tiny_frac;        
+wire            vfdsu_ex4_double;                     
+wire            vfdsu_ex4_dz;                         
+wire    [12:0]  vfdsu_ex4_expnt_rst;                  
+wire    [54:0]  vfdsu_ex4_frac;                       
+wire            vfdsu_ex4_nv;                         
+wire            vfdsu_ex4_nx;                         
+wire            vfdsu_ex4_of;                         
+wire            vfdsu_ex4_of_rst_lfn;                 
+wire    [1 :0]  vfdsu_ex4_potnt_norm;                 
+wire            vfdsu_ex4_potnt_of;                   
+wire            vfdsu_ex4_potnt_uf;                   
+wire    [51:0]  vfdsu_ex4_qnan_f;                     
+wire            vfdsu_ex4_qnan_sign;                  
+wire            vfdsu_ex4_result_inf;                 
+wire            vfdsu_ex4_result_lfn;                 
+wire            vfdsu_ex4_result_nor;                 
+wire            vfdsu_ex4_result_qnan;                
+wire            vfdsu_ex4_result_sign;                
+wire            vfdsu_ex4_result_zero;                
+wire            vfdsu_ex4_rslt_denorm;                
+wire            vfdsu_ex4_single;                     
+wire            vfdsu_ex4_uf;                         
+wire            vfpu_yy_xx_dqnan;                     
+wire    [2 :0]  vfpu_yy_xx_rm;                        
+
+// Net groups: vfdsu_ex2_* link prepare and srt, vfdsu_ex3_* link srt and round, vfdsu_ex4_* link round and pack.
+// &Instance("ct_vfdsu_prepare"); @28
+ct_vfdsu_prepare  x_ct_vfdsu_prepare (  // takes the ex1_* operands/controls plus vfpu_yy_xx_dqnan/rm; shares ex1_divisor, ex1_remainder and every vfdsu_ex2_* net with srt
+  .cp0_vfpu_icg_en       (cp0_vfpu_icg_en      ),
+  .cp0_yy_clk_en         (cp0_yy_clk_en        ),
+  .cpurst_b              (cpurst_b             ),
+  .ex1_div               (ex1_div              ),
+  .ex1_divisor           (ex1_divisor          ),
+  .ex1_double            (ex1_double           ),
+  .ex1_pipedown          (ex1_pipedown         ),
+  .ex1_remainder         (ex1_remainder        ),
+  .ex1_scalar            (ex1_scalar           ),
+  .ex1_single            (ex1_single           ),
+  .ex1_sqrt              (ex1_sqrt             ),
+  .ex1_src0              (ex1_src0             ),
+  .ex1_src1              (ex1_src1             ),
+  .ex1_static_rm         (ex1_static_rm        ),
+  .forever_cpuclk        (forever_cpuclk       ),
+  .pad_yy_icg_scan_en    (pad_yy_icg_scan_en   ),
+  .vfdsu_ex2_div         (vfdsu_ex2_div        ),
+  .vfdsu_ex2_double      (vfdsu_ex2_double     ),
+  .vfdsu_ex2_dz          (vfdsu_ex2_dz         ),
+  .vfdsu_ex2_expnt_add0  (vfdsu_ex2_expnt_add0 ),
+  .vfdsu_ex2_expnt_add1  (vfdsu_ex2_expnt_add1 ),
+  .vfdsu_ex2_nv          (vfdsu_ex2_nv         ),
+  .vfdsu_ex2_of_rm_lfn   (vfdsu_ex2_of_rm_lfn  ),
+  .vfdsu_ex2_op0_norm    (vfdsu_ex2_op0_norm   ),
+  .vfdsu_ex2_op1_norm    (vfdsu_ex2_op1_norm   ),
+  .vfdsu_ex2_qnan_f      (vfdsu_ex2_qnan_f     ),
+  .vfdsu_ex2_qnan_sign   (vfdsu_ex2_qnan_sign  ),
+  .vfdsu_ex2_result_inf  (vfdsu_ex2_result_inf ),
+  .vfdsu_ex2_result_qnan (vfdsu_ex2_result_qnan),
+  .vfdsu_ex2_result_sign (vfdsu_ex2_result_sign),
+  .vfdsu_ex2_result_zero (vfdsu_ex2_result_zero),
+  .vfdsu_ex2_rm          (vfdsu_ex2_rm         ),
+  .vfdsu_ex2_single      (vfdsu_ex2_single     ),
+  .vfdsu_ex2_sqrt        (vfdsu_ex2_sqrt       ),
+  .vfdsu_ex2_srt_skip    (vfdsu_ex2_srt_skip   ),
+  .vfpu_yy_xx_dqnan      (vfpu_yy_xx_dqnan     ),
+  .vfpu_yy_xx_rm         (vfpu_yy_xx_rm        )
+);
+
+// &Instance("ct_vfdsu_srt"); @29
+ct_vfdsu_srt  x_ct_vfdsu_srt (  // only instance wired to srt_ctrl_rem_zero/srt_ctrl_skip_srt and to the srt_* / ex2_* control inputs; shares total_qt_rt_58 and every vfdsu_ex3_* net with round
+  .cp0_vfpu_icg_en                       (cp0_vfpu_icg_en                      ),
+  .cp0_yy_clk_en                         (cp0_yy_clk_en                        ),
+  .cpurst_b                              (cpurst_b                             ),
+  .ex1_div                               (ex1_div                              ),
+  .ex1_divisor                           (ex1_divisor                          ),
+  .ex1_pipedown                          (ex1_pipedown                         ),
+  .ex1_remainder                         (ex1_remainder                        ),
+  .ex1_sqrt                              (ex1_sqrt                             ),
+  .ex2_pipedown                          (ex2_pipedown                         ),
+  .ex2_srt_first_round                   (ex2_srt_first_round                  ),
+  .forever_cpuclk                        (forever_cpuclk                       ),
+  .pad_yy_icg_scan_en                    (pad_yy_icg_scan_en                   ),
+  .srt_ctrl_rem_zero                     (srt_ctrl_rem_zero                    ),
+  .srt_ctrl_skip_srt                     (srt_ctrl_skip_srt                    ),
+  .srt_secd_round                        (srt_secd_round                       ),
+  .srt_sm_on                             (srt_sm_on                            ),
+  .total_qt_rt_58                        (total_qt_rt_58                       ),
+  .vfdsu_ex2_div                         (vfdsu_ex2_div                        ),
+  .vfdsu_ex2_double                      (vfdsu_ex2_double                     ),
+  .vfdsu_ex2_dz                          (vfdsu_ex2_dz                         ),
+  .vfdsu_ex2_expnt_add0                  (vfdsu_ex2_expnt_add0                 ),
+  .vfdsu_ex2_expnt_add1                  (vfdsu_ex2_expnt_add1                 ),
+  .vfdsu_ex2_nv                          (vfdsu_ex2_nv                         ),
+  .vfdsu_ex2_of_rm_lfn                   (vfdsu_ex2_of_rm_lfn                  ),
+  .vfdsu_ex2_op0_norm                    (vfdsu_ex2_op0_norm                   ),
+  .vfdsu_ex2_op1_norm                    (vfdsu_ex2_op1_norm                   ),
+  .vfdsu_ex2_qnan_f                      (vfdsu_ex2_qnan_f                     ),
+  .vfdsu_ex2_qnan_sign                   (vfdsu_ex2_qnan_sign                  ),
+  .vfdsu_ex2_result_inf                  (vfdsu_ex2_result_inf                 ),
+  .vfdsu_ex2_result_qnan                 (vfdsu_ex2_result_qnan                ),
+  .vfdsu_ex2_result_sign                 (vfdsu_ex2_result_sign                ),
+  .vfdsu_ex2_result_zero                 (vfdsu_ex2_result_zero                ),
+  .vfdsu_ex2_rm                          (vfdsu_ex2_rm                         ),
+  .vfdsu_ex2_single                      (vfdsu_ex2_single                     ),
+  .vfdsu_ex2_sqrt                        (vfdsu_ex2_sqrt                       ),
+  .vfdsu_ex2_srt_skip                    (vfdsu_ex2_srt_skip                   ),
+  .vfdsu_ex3_doub_expnt_rst              (vfdsu_ex3_doub_expnt_rst             ),
+  .vfdsu_ex3_double                      (vfdsu_ex3_double                     ),
+  .vfdsu_ex3_dz                          (vfdsu_ex3_dz                         ),
+  .vfdsu_ex3_half_expnt_rst              (vfdsu_ex3_half_expnt_rst             ),
+  .vfdsu_ex3_id_srt_skip                 (vfdsu_ex3_id_srt_skip                ),
+  .vfdsu_ex3_nv                          (vfdsu_ex3_nv                         ),
+  .vfdsu_ex3_of                          (vfdsu_ex3_of                         ),
+  .vfdsu_ex3_potnt_of                    (vfdsu_ex3_potnt_of                   ),
+  .vfdsu_ex3_potnt_uf                    (vfdsu_ex3_potnt_uf                   ),
+  .vfdsu_ex3_qnan_f                      (vfdsu_ex3_qnan_f                     ),
+  .vfdsu_ex3_qnan_sign                   (vfdsu_ex3_qnan_sign                  ),
+  .vfdsu_ex3_rem_sign                    (vfdsu_ex3_rem_sign                   ),
+  .vfdsu_ex3_rem_zero                    (vfdsu_ex3_rem_zero                   ),
+  .vfdsu_ex3_result_denorm_round_add_num (vfdsu_ex3_result_denorm_round_add_num),
+  .vfdsu_ex3_result_inf                  (vfdsu_ex3_result_inf                 ),
+  .vfdsu_ex3_result_lfn                  (vfdsu_ex3_result_lfn                 ),
+  .vfdsu_ex3_result_qnan                 (vfdsu_ex3_result_qnan                ),
+  .vfdsu_ex3_result_sign                 (vfdsu_ex3_result_sign                ),
+  .vfdsu_ex3_result_zero                 (vfdsu_ex3_result_zero                ),
+  .vfdsu_ex3_rm                          (vfdsu_ex3_rm                         ),
+  .vfdsu_ex3_rslt_denorm                 (vfdsu_ex3_rslt_denorm                ),
+  .vfdsu_ex3_sing_expnt_rst              (vfdsu_ex3_sing_expnt_rst             ),
+  .vfdsu_ex3_single                      (vfdsu_ex3_single                     ),
+  .vfdsu_ex3_uf                          (vfdsu_ex3_uf                         )
+);
+
+// &Instance("ct_vfdsu_round"); @30
+ct_vfdsu_round  x_ct_vfdsu_round (  // besides the vfdsu_ex3_* nets it also taps vfdsu_ex2_of_rm_lfn; shares every vfdsu_ex4_* net with pack
+  .cp0_vfpu_icg_en                       (cp0_vfpu_icg_en                      ),
+  .cp0_yy_clk_en                         (cp0_yy_clk_en                        ),
+  .cpurst_b                              (cpurst_b                             ),
+  .ex3_pipedown                          (ex3_pipedown                         ),
+  .forever_cpuclk                        (forever_cpuclk                       ),
+  .pad_yy_icg_scan_en                    (pad_yy_icg_scan_en                   ),
+  .total_qt_rt_58                        (total_qt_rt_58                       ),
+  .vfdsu_ex2_of_rm_lfn                   (vfdsu_ex2_of_rm_lfn                  ),
+  .vfdsu_ex3_doub_expnt_rst              (vfdsu_ex3_doub_expnt_rst             ),
+  .vfdsu_ex3_double                      (vfdsu_ex3_double                     ),
+  .vfdsu_ex3_dz                          (vfdsu_ex3_dz                         ),
+  .vfdsu_ex3_half_expnt_rst              (vfdsu_ex3_half_expnt_rst             ),
+  .vfdsu_ex3_id_srt_skip                 (vfdsu_ex3_id_srt_skip                ),
+  .vfdsu_ex3_nv                          (vfdsu_ex3_nv                         ),
+  .vfdsu_ex3_of                          (vfdsu_ex3_of                         ),
+  .vfdsu_ex3_potnt_of                    (vfdsu_ex3_potnt_of                   ),
+  .vfdsu_ex3_potnt_uf                    (vfdsu_ex3_potnt_uf                   ),
+  .vfdsu_ex3_qnan_f                      (vfdsu_ex3_qnan_f                     ),
+  .vfdsu_ex3_qnan_sign                   (vfdsu_ex3_qnan_sign                  ),
+  .vfdsu_ex3_rem_sign                    (vfdsu_ex3_rem_sign                   ),
+  .vfdsu_ex3_rem_zero                    (vfdsu_ex3_rem_zero                   ),
+  .vfdsu_ex3_result_denorm_round_add_num (vfdsu_ex3_result_denorm_round_add_num),
+  .vfdsu_ex3_result_inf                  (vfdsu_ex3_result_inf                 ),
+  .vfdsu_ex3_result_lfn                  (vfdsu_ex3_result_lfn                 ),
+  .vfdsu_ex3_result_qnan                 (vfdsu_ex3_result_qnan                ),
+  .vfdsu_ex3_result_sign                 (vfdsu_ex3_result_sign                ),
+  .vfdsu_ex3_result_zero                 (vfdsu_ex3_result_zero                ),
+  .vfdsu_ex3_rm                          (vfdsu_ex3_rm                         ),
+  .vfdsu_ex3_rslt_denorm                 (vfdsu_ex3_rslt_denorm                ),
+  .vfdsu_ex3_sing_expnt_rst              (vfdsu_ex3_sing_expnt_rst             ),
+  .vfdsu_ex3_single                      (vfdsu_ex3_single                     ),
+  .vfdsu_ex3_uf                          (vfdsu_ex3_uf                         ),
+  .vfdsu_ex4_denorm_to_tiny_frac         (vfdsu_ex4_denorm_to_tiny_frac        ),
+  .vfdsu_ex4_double                      (vfdsu_ex4_double                     ),
+  .vfdsu_ex4_dz                          (vfdsu_ex4_dz                         ),
+  .vfdsu_ex4_expnt_rst                   (vfdsu_ex4_expnt_rst                  ),
+  .vfdsu_ex4_frac                        (vfdsu_ex4_frac                       ),
+  .vfdsu_ex4_nv                          (vfdsu_ex4_nv                         ),
+  .vfdsu_ex4_nx                          (vfdsu_ex4_nx                         ),
+  .vfdsu_ex4_of                          (vfdsu_ex4_of                         ),
+  .vfdsu_ex4_of_rst_lfn                  (vfdsu_ex4_of_rst_lfn                 ),
+  .vfdsu_ex4_potnt_norm                  (vfdsu_ex4_potnt_norm                 ),
+  .vfdsu_ex4_potnt_of                    (vfdsu_ex4_potnt_of                   ),
+  .vfdsu_ex4_potnt_uf                    (vfdsu_ex4_potnt_uf                   ),
+  .vfdsu_ex4_qnan_f                      (vfdsu_ex4_qnan_f                     ),
+  .vfdsu_ex4_qnan_sign                   (vfdsu_ex4_qnan_sign                  ),
+  .vfdsu_ex4_result_inf                  (vfdsu_ex4_result_inf                 ),
+  .vfdsu_ex4_result_lfn                  (vfdsu_ex4_result_lfn                 ),
+  .vfdsu_ex4_result_nor                  (vfdsu_ex4_result_nor                 ),
+  .vfdsu_ex4_result_qnan                 (vfdsu_ex4_result_qnan                ),
+  .vfdsu_ex4_result_sign                 (vfdsu_ex4_result_sign                ),
+  .vfdsu_ex4_result_zero                 (vfdsu_ex4_result_zero                ),
+  .vfdsu_ex4_rslt_denorm                 (vfdsu_ex4_rslt_denorm                ),
+  .vfdsu_ex4_single                      (vfdsu_ex4_single                     ),
+  .vfdsu_ex4_uf                          (vfdsu_ex4_uf                         )
+);
+
+// &Instance("ct_vfdsu_pack"); @31
+ct_vfdsu_pack  x_ct_vfdsu_pack (  // no clock/reset ports are connected here; the only instance wired to the module outputs ex4_out_expt and ex4_out_result
+  .ex4_out_expt                  (ex4_out_expt                 ),
+  .ex4_out_result                (ex4_out_result               ),
+  .vfdsu_ex4_denorm_to_tiny_frac (vfdsu_ex4_denorm_to_tiny_frac),
+  .vfdsu_ex4_double              (vfdsu_ex4_double             ),
+  .vfdsu_ex4_dz                  (vfdsu_ex4_dz                 ),
+  .vfdsu_ex4_expnt_rst           (vfdsu_ex4_expnt_rst          ),
+  .vfdsu_ex4_frac                (vfdsu_ex4_frac               ),
+  .vfdsu_ex4_nv                  (vfdsu_ex4_nv                 ),
+  .vfdsu_ex4_nx                  (vfdsu_ex4_nx                 ),
+  .vfdsu_ex4_of                  (vfdsu_ex4_of                 ),
+  .vfdsu_ex4_of_rst_lfn          (vfdsu_ex4_of_rst_lfn         ),
+  .vfdsu_ex4_potnt_norm          (vfdsu_ex4_potnt_norm         ),
+  .vfdsu_ex4_potnt_of            (vfdsu_ex4_potnt_of           ),
+  .vfdsu_ex4_potnt_uf            (vfdsu_ex4_potnt_uf           ),
+  .vfdsu_ex4_qnan_f              (vfdsu_ex4_qnan_f             ),
+  .vfdsu_ex4_qnan_sign           (vfdsu_ex4_qnan_sign          ),
+  .vfdsu_ex4_result_inf          (vfdsu_ex4_result_inf         ),
+  .vfdsu_ex4_result_lfn          (vfdsu_ex4_result_lfn         ),
+  .vfdsu_ex4_result_nor          (vfdsu_ex4_result_nor         ),
+  .vfdsu_ex4_result_qnan         (vfdsu_ex4_result_qnan        ),
+  .vfdsu_ex4_result_sign         (vfdsu_ex4_result_sign        ),
+  .vfdsu_ex4_result_zero         (vfdsu_ex4_result_zero        ),
+  .vfdsu_ex4_rslt_denorm         (vfdsu_ex4_rslt_denorm        ),
+  .vfdsu_ex4_single              (vfdsu_ex4_single             ),
+  .vfdsu_ex4_uf                  (vfdsu_ex4_uf                 )
+);
+
+
+
+// &ModuleEnd; @34
+endmodule
+
+
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ff1.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ff1.v
new file mode 100644
index 00000000..c6d2e867
--- /dev/null
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ff1.v
@@ -0,0 +1,99 @@
+/*Copyright 2019-2021 T-Head Semiconductor Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+module ct_vfdsu_ff1(
+  fanc_shift_num,
+  frac_bin_val,
+  frac_num
+);
+// Combinational leading-one finder for a 52-bit value: locates the most significant 1 in frac_num and left-justifies it.
+// &Ports; @22
+input   [51:0]  frac_num;      
+output  [51:0]  fanc_shift_num; // frac_num shifted left so its leading 1 lands in bit 51 ("fanc" spelling is part of the port name)
+output  [12:0]  frac_bin_val;  // minus the number of leading zeros, as a 13-bit two's-complement value
+
+// &Regs; @23
+reg     [51:0]  fanc_shift_num; 
+reg     [12:0]  frac_bin_val;  
+
+// &Wires; @24
+wire    [51:0]  frac_num;      
+
+// Priority table: for k leading zeros, fanc_shift_num = frac_num << k and frac_bin_val = -k (13'h1fff = -1, 13'h1ffe = -2, ...).
+// &CombBeg; @26
+always @( frac_num[51:0])
+begin
+casez(frac_num[51:0])
+  52'b1???????????????????????????????????????????????????: begin frac_bin_val[12:0] = 13'h0;    fanc_shift_num[51:0] = frac_num[51:0];       end // bit 51 already set: no shift, value 0
+  52'b01??????????????????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1fff; fanc_shift_num[51:0] = {frac_num[50:0],1'b0};end
+  52'b001?????????????????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ffe; fanc_shift_num[51:0] = {frac_num[49:0],2'b0};end
+  52'b0001????????????????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ffd; fanc_shift_num[51:0] = {frac_num[48:0],3'b0};end
+  52'b00001???????????????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ffc; fanc_shift_num[51:0] = {frac_num[47:0],4'b0};end
+  52'b000001??????????????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ffb; fanc_shift_num[51:0] = {frac_num[46:0],5'b0};end
+  52'b0000001?????????????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ffa; fanc_shift_num[51:0] = {frac_num[45:0],6'b0};end
+  52'b00000001????????????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ff9; fanc_shift_num[51:0] = {frac_num[44:0],7'b0};end
+  52'b000000001???????????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ff8; fanc_shift_num[51:0] = {frac_num[43:0],8'b0};end
+  52'b0000000001??????????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ff7; fanc_shift_num[51:0] = {frac_num[42:0],9'b0};end
+  52'b00000000001?????????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ff6; fanc_shift_num[51:0] = {frac_num[41:0],10'b0};end
+  52'b000000000001????????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ff5; fanc_shift_num[51:0] = {frac_num[40:0],11'b0};end
+  52'b0000000000001???????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ff4; fanc_shift_num[51:0] = {frac_num[39:0],12'b0};end
+  52'b00000000000001??????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ff3; fanc_shift_num[51:0] = {frac_num[38:0],13'b0};end
+  52'b000000000000001?????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ff2; fanc_shift_num[51:0] = {frac_num[37:0],14'b0};end
+  52'b0000000000000001????????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ff1; fanc_shift_num[51:0] = {frac_num[36:0],15'b0};end
+  52'b00000000000000001???????????????????????????????????: begin frac_bin_val[12:0] = 13'h1ff0; fanc_shift_num[51:0] = {frac_num[35:0],16'b0};end
+  52'b000000000000000001??????????????????????????????????: begin frac_bin_val[12:0] = 13'h1fef; fanc_shift_num[51:0] = {frac_num[34:0],17'b0};end
+  52'b0000000000000000001?????????????????????????????????: begin frac_bin_val[12:0] = 13'h1fee; fanc_shift_num[51:0] = {frac_num[33:0],18'b0};end
+  52'b00000000000000000001????????????????????????????????: begin frac_bin_val[12:0] = 13'h1fed; fanc_shift_num[51:0] = {frac_num[32:0],19'b0};end
+  52'b000000000000000000001???????????????????????????????: begin frac_bin_val[12:0] = 13'h1fec; fanc_shift_num[51:0] = {frac_num[31:0],20'b0};end
+  52'b0000000000000000000001??????????????????????????????: begin frac_bin_val[12:0] = 13'h1feb; fanc_shift_num[51:0] = {frac_num[30:0],21'b0};end
+  52'b00000000000000000000001?????????????????????????????: begin frac_bin_val[12:0] = 13'h1fea; fanc_shift_num[51:0] = {frac_num[29:0],22'b0};end
+  52'b000000000000000000000001????????????????????????????: begin frac_bin_val[12:0] = 13'h1fe9; fanc_shift_num[51:0] = {frac_num[28:0],23'b0};end
+  52'b0000000000000000000000001???????????????????????????: begin frac_bin_val[12:0] = 13'h1fe8; fanc_shift_num[51:0] = {frac_num[27:0],24'b0};end
+  52'b00000000000000000000000001??????????????????????????: begin frac_bin_val[12:0] = 13'h1fe7; fanc_shift_num[51:0] = {frac_num[26:0],25'b0};end
+  52'b000000000000000000000000001?????????????????????????: begin frac_bin_val[12:0] = 13'h1fe6; fanc_shift_num[51:0] = {frac_num[25:0],26'b0};end
+  52'b0000000000000000000000000001????????????????????????: begin frac_bin_val[12:0] = 13'h1fe5; fanc_shift_num[51:0] = {frac_num[24:0],27'b0};end
+  52'b00000000000000000000000000001???????????????????????: begin frac_bin_val[12:0] = 13'h1fe4; fanc_shift_num[51:0] = {frac_num[23:0],28'b0};end
+  52'b000000000000000000000000000001??????????????????????: begin frac_bin_val[12:0] = 13'h1fe3; fanc_shift_num[51:0] = {frac_num[22:0],29'b0};end
+  52'b0000000000000000000000000000001?????????????????????: begin frac_bin_val[12:0] = 13'h1fe2; fanc_shift_num[51:0] = {frac_num[21:0],30'b0};end
+  52'b00000000000000000000000000000001????????????????????: begin frac_bin_val[12:0] = 13'h1fe1; fanc_shift_num[51:0] = {frac_num[20:0],31'b0};end
+  52'b000000000000000000000000000000001???????????????????: begin frac_bin_val[12:0] = 13'h1fe0; fanc_shift_num[51:0] = {frac_num[19:0],32'b0};end
+  52'b0000000000000000000000000000000001??????????????????: begin frac_bin_val[12:0] = 13'h1fdf; fanc_shift_num[51:0] = {frac_num[18:0],33'b0};end
+  52'b00000000000000000000000000000000001?????????????????: begin frac_bin_val[12:0] = 13'h1fde; fanc_shift_num[51:0] = {frac_num[17:0],34'b0};end
+  52'b000000000000000000000000000000000001????????????????: begin frac_bin_val[12:0] = 13'h1fdd; fanc_shift_num[51:0] = {frac_num[16:0],35'b0};end
+  52'b0000000000000000000000000000000000001???????????????: begin frac_bin_val[12:0] = 13'h1fdc; fanc_shift_num[51:0] = {frac_num[15:0],36'b0};end
+  52'b00000000000000000000000000000000000001??????????????: begin frac_bin_val[12:0] = 13'h1fdb; fanc_shift_num[51:0] = {frac_num[14:0],37'b0};end
+  52'b000000000000000000000000000000000000001?????????????: begin frac_bin_val[12:0] = 13'h1fda; fanc_shift_num[51:0] = {frac_num[13:0],38'b0};end
+  52'b0000000000000000000000000000000000000001????????????: begin frac_bin_val[12:0] = 13'h1fd9; fanc_shift_num[51:0] = {frac_num[12:0],39'b0};end
+  52'b00000000000000000000000000000000000000001???????????: begin frac_bin_val[12:0] = 13'h1fd8; fanc_shift_num[51:0] = {frac_num[11:0],40'b0};end
+  52'b000000000000000000000000000000000000000001??????????: begin frac_bin_val[12:0] = 13'h1fd7; fanc_shift_num[51:0] = {frac_num[10:0],41'b0};end
+  52'b0000000000000000000000000000000000000000001?????????: begin frac_bin_val[12:0] = 13'h1fd6; fanc_shift_num[51:0] = {frac_num[9:0],42'b0};end
+  52'b00000000000000000000000000000000000000000001????????: begin frac_bin_val[12:0] = 13'h1fd5; fanc_shift_num[51:0] = {frac_num[8:0],43'b0};end
+  52'b000000000000000000000000000000000000000000001???????: begin frac_bin_val[12:0] = 13'h1fd4; fanc_shift_num[51:0] = {frac_num[7:0],44'b0};end
+  52'b0000000000000000000000000000000000000000000001??????: begin frac_bin_val[12:0] = 13'h1fd3; fanc_shift_num[51:0] = {frac_num[6:0],45'b0};end
+  52'b00000000000000000000000000000000000000000000001?????: begin frac_bin_val[12:0] = 13'h1fd2; fanc_shift_num[51:0] = {frac_num[5:0],46'b0};end
+  52'b000000000000000000000000000000000000000000000001????: begin frac_bin_val[12:0] = 13'h1fd1; fanc_shift_num[51:0] = {frac_num[4:0],47'b0};end
+  52'b0000000000000000000000000000000000000000000000001???: begin frac_bin_val[12:0] = 13'h1fd0; fanc_shift_num[51:0] = {frac_num[3:0],48'b0};end
+  52'b00000000000000000000000000000000000000000000000001??: begin frac_bin_val[12:0] = 13'h1fcf; fanc_shift_num[51:0] = {frac_num[2:0],49'b0};end
+  52'b000000000000000000000000000000000000000000000000001?: begin frac_bin_val[12:0] = 13'h1fce; fanc_shift_num[51:0] = {frac_num[1:0],50'b0};end
+  52'b0000000000000000000000000000000000000000000000000001: begin frac_bin_val[12:0] = 13'h1fcd; fanc_shift_num[51:0] = {frac_num[0:0],51'b0};end // only bit 0 set: 13'h1fcd = -51
+  52'b0000000000000000000000000000000000000000000000000000: begin frac_bin_val[12:0] = 13'h1fcc; fanc_shift_num[51:0] = {52'b0};end // all-zero input: reports -52 with a zero shifted value
+  default:begin frac_bin_val[12:0] = 13'h000;     fanc_shift_num[51:0] = {52'b0};end // taken only when no pattern above matches (e.g. X bits in simulation); both outputs forced to 0
+endcase 
+// &CombEnd;                                                                                          @83
+end
+                                                                                                  
+// &ModuleEnd;                                                                                        @85
+endmodule
+
+
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v
new file mode 100644
index 00000000..e1d2e18a
--- /dev/null
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v
@@ -0,0 +1,417 @@
+/*Copyright 2019-2021 T-Head Semiconductor Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// &ModuleBeg; @22
+module ct_vfdsu_pack(
+  ex4_out_expt,
+  ex4_out_result,
+  vfdsu_ex4_denorm_to_tiny_frac,
+  vfdsu_ex4_double,
+  vfdsu_ex4_dz,
+  vfdsu_ex4_expnt_rst,
+  vfdsu_ex4_frac,
+  vfdsu_ex4_nv,
+  vfdsu_ex4_nx,
+  vfdsu_ex4_of,
+  vfdsu_ex4_of_rst_lfn,
+  vfdsu_ex4_potnt_norm,
+  vfdsu_ex4_potnt_of,
+  vfdsu_ex4_potnt_uf,
+  vfdsu_ex4_qnan_f,
+  vfdsu_ex4_qnan_sign,
+  vfdsu_ex4_result_inf,
+  vfdsu_ex4_result_lfn,
+  vfdsu_ex4_result_nor,
+  vfdsu_ex4_result_qnan,
+  vfdsu_ex4_result_sign,
+  vfdsu_ex4_result_zero,
+  vfdsu_ex4_rslt_denorm,
+  vfdsu_ex4_single,
+  vfdsu_ex4_uf
+);
+// EX4 pack stage: builds the 64-bit result in double, single or half layout and the 5-bit exception flag vector.
+// &Ports; @23
+input           vfdsu_ex4_denorm_to_tiny_frac; 
+input           vfdsu_ex4_double;             
+input           vfdsu_ex4_dz;                 
+input   [12:0]  vfdsu_ex4_expnt_rst;          
+input   [54:0]  vfdsu_ex4_frac;               
+input           vfdsu_ex4_nv;                 
+input           vfdsu_ex4_nx;                 
+input           vfdsu_ex4_of;                 
+input           vfdsu_ex4_of_rst_lfn;         
+input   [1 :0]  vfdsu_ex4_potnt_norm;         
+input           vfdsu_ex4_potnt_of;           
+input           vfdsu_ex4_potnt_uf;           
+input   [51:0]  vfdsu_ex4_qnan_f;             
+input           vfdsu_ex4_qnan_sign;          
+input           vfdsu_ex4_result_inf;         
+input           vfdsu_ex4_result_lfn;         
+input           vfdsu_ex4_result_nor;         
+input           vfdsu_ex4_result_qnan;        
+input           vfdsu_ex4_result_sign;        
+input           vfdsu_ex4_result_zero;        
+input           vfdsu_ex4_rslt_denorm;        
+input           vfdsu_ex4_single;             
+input           vfdsu_ex4_uf;                 
+output  [4 :0]  ex4_out_expt;                 
+output  [63:0]  ex4_out_result;               
+
+// &Regs; @24
+reg     [51:0]  ex4_denorm_frac;              
+reg     [51:0]  ex4_frac_52;                  
+reg     [51:0]  ex4_half_denorm_frac;         
+reg     [63:0]  ex4_out_result;               
+reg     [51:0]  ex4_single_denorm_frac;       
+reg     [12:0]  expnt_add_op1;                
+
+// &Wires; @25
+wire            ex4_cor_nx;                   
+wire            ex4_cor_uf;                   
+wire            ex4_denorm_potnt_norm;        
+wire    [63:0]  ex4_denorm_result;            
+wire    [63:0]  ex4_doub_lfn;                 
+wire    [63:0]  ex4_doub_rst0;                
+wire    [63:0]  ex4_doub_rst_inf;             
+wire    [63:0]  ex4_doub_rst_norm;            
+wire    [63:0]  ex4_doub_rst_qnan;            
+wire    [12:0]  ex4_expnt_rst;                
+wire            ex4_final_rst_norm;           
+wire    [54:0]  ex4_frac;                     
+wire    [63:0]  ex4_half_lfn;                 
+wire    [63:0]  ex4_half_rst0;                
+wire    [63:0]  ex4_half_rst_inf;             
+wire    [63:0]  ex4_half_rst_norm;            
+wire    [63:0]  ex4_half_rst_qnan;            
+wire            ex4_of_plus;                  
+wire    [4 :0]  ex4_out_expt;                 
+wire            ex4_result_inf;               
+wire            ex4_result_lfn;               
+wire            ex4_rslt_denorm;              
+wire    [63:0]  ex4_rst0;                     
+wire    [63:0]  ex4_rst_inf;                  
+wire    [63:0]  ex4_rst_lfn;                  
+wire            ex4_rst_nor;                  
+wire    [63:0]  ex4_rst_norm;                 
+wire    [63:0]  ex4_rst_qnan;                 
+wire    [63:0]  ex4_sing_lfn;                 
+wire    [63:0]  ex4_sing_rst0;                
+wire    [63:0]  ex4_sing_rst_inf;             
+wire    [63:0]  ex4_sing_rst_norm;            
+wire    [63:0]  ex4_sing_rst_qnan;            
+wire            ex4_uf_plus;                  
+wire            vfdsu_ex4_denorm_to_tiny_frac; 
+wire            vfdsu_ex4_double;             
+wire            vfdsu_ex4_dz;                 
+wire    [12:0]  vfdsu_ex4_expnt_rst;          
+wire    [54:0]  vfdsu_ex4_frac;               
+wire            vfdsu_ex4_nv;                 
+wire            vfdsu_ex4_nx;                 
+wire            vfdsu_ex4_of;                 
+wire            vfdsu_ex4_of_rst_lfn;         
+wire    [1 :0]  vfdsu_ex4_potnt_norm;         
+wire            vfdsu_ex4_potnt_of;           
+wire            vfdsu_ex4_potnt_uf;           
+wire    [51:0]  vfdsu_ex4_qnan_f;             
+wire            vfdsu_ex4_qnan_sign;          
+wire            vfdsu_ex4_result_inf;         
+wire            vfdsu_ex4_result_lfn;         
+wire            vfdsu_ex4_result_nor;         
+wire            vfdsu_ex4_result_qnan;        
+wire            vfdsu_ex4_result_sign;        
+wire            vfdsu_ex4_result_zero;        
+wire            vfdsu_ex4_rslt_denorm;        
+wire            vfdsu_ex4_single;             
+wire            vfdsu_ex4_uf;                 
+
+
+//============================EX4 STAGE=====================
+assign ex4_frac[54:0] = vfdsu_ex4_frac[54:0];
+//exponent adder: the adjustment is chosen from the two MSBs of the fraction
+// &CombBeg; @30
+always @( ex4_frac[54:53])
+begin
+casez(ex4_frac[54:53])
+  2'b00   : expnt_add_op1[12:0] = 13'h0fff;  //exponent minus 1: 0x0fff is -1 modulo 2^12 only; enough because just ex4_expnt_rst[10:0] and narrower slices are packed below
+  2'b01   : expnt_add_op1[12:0] = 13'h0;    //the exponent stays unchanged
+  2'b1?   : expnt_add_op1[12:0] = 13'h1;    // the exponent is incremented by 1
+  default : expnt_add_op1[12:0] = 13'b0;  
+endcase
+// &CombEnd; @37
+end
+assign ex4_expnt_rst[12:0] = vfdsu_ex4_expnt_rst[12:0] + 
+                             expnt_add_op1[12:0];
+
+//==========================Result Pack=====================
+
+// denormal result pack (double-precision table)
+// the exponent selects how far the fraction is shifted right
+// &CombBeg; @45
+always @( vfdsu_ex4_expnt_rst[12:0]
+       or ex4_frac[54:1]
+       or vfdsu_ex4_denorm_to_tiny_frac)
+begin
+case(vfdsu_ex4_expnt_rst[12:0])
+  13'h1:   ex4_denorm_frac[51:0] = {      ex4_frac[52:1]}; //-1022 1
+  13'h0:   ex4_denorm_frac[51:0] = {      ex4_frac[53:2]}; //-1023 0
+  13'h1fff:ex4_denorm_frac[51:0] = {      ex4_frac[54:3]}; //-1024 -1
+  13'h1ffe:ex4_denorm_frac[51:0] = {1'b0, ex4_frac[54:4]}; //-1025 -2
+  13'h1ffd:ex4_denorm_frac[51:0] = {2'b0, ex4_frac[54:5]}; //-1026 -3
+  13'h1ffc:ex4_denorm_frac[51:0] = {3'b0, ex4_frac[54:6]}; //-1027 -4
+  13'h1ffb:ex4_denorm_frac[51:0] = {4'b0, ex4_frac[54:7]}; //-1028 -5
+  13'h1ffa:ex4_denorm_frac[51:0] = {5'b0, ex4_frac[54:8]}; //-1029 -6
+  13'h1ff9:ex4_denorm_frac[51:0] = {6'b0, ex4_frac[54:9]}; //-1030 -7
+  13'h1ff8:ex4_denorm_frac[51:0] = {7'b0, ex4_frac[54:10]}; //-1031 -8
+  13'h1ff7:ex4_denorm_frac[51:0] = {8'b0, ex4_frac[54:11]}; //-1032 -9
+  13'h1ff6:ex4_denorm_frac[51:0] = {9'b0, ex4_frac[54:12]}; //-1033 -10
+  13'h1ff5:ex4_denorm_frac[51:0] = {10'b0,ex4_frac[54:13]}; //-1034 -11
+  13'h1ff4:ex4_denorm_frac[51:0] = {11'b0,ex4_frac[54:14]}; //-1035 -12
+  13'h1ff3:ex4_denorm_frac[51:0] = {12'b0,ex4_frac[54:15]}; //-1036 -13  
+  13'h1ff2:ex4_denorm_frac[51:0] = {13'b0,ex4_frac[54:16]}; // -1037
+  13'h1ff1:ex4_denorm_frac[51:0] = {14'b0,ex4_frac[54:17]}; //-1038
+  13'h1ff0:ex4_denorm_frac[51:0] = {15'b0,ex4_frac[54:18]}; //-1039
+  13'h1fef:ex4_denorm_frac[51:0] = {16'b0,ex4_frac[54:19]}; //-1040
+  13'h1fee:ex4_denorm_frac[51:0] = {17'b0,ex4_frac[54:20]}; //-1041
+  13'h1fed:ex4_denorm_frac[51:0] = {18'b0,ex4_frac[54:21]}; //-1042
+  13'h1fec:ex4_denorm_frac[51:0] = {19'b0,ex4_frac[54:22]}; //-1043
+  13'h1feb:ex4_denorm_frac[51:0] = {20'b0,ex4_frac[54:23]}; //-1044
+  13'h1fea:ex4_denorm_frac[51:0] = {21'b0,ex4_frac[54:24]}; //-1045
+  13'h1fe9:ex4_denorm_frac[51:0] = {22'b0,ex4_frac[54:25]}; //-1046
+  13'h1fe8:ex4_denorm_frac[51:0] = {23'b0,ex4_frac[54:26]}; //-1047
+  13'h1fe7:ex4_denorm_frac[51:0] = {24'b0,ex4_frac[54:27]}; //-1048
+  13'h1fe6:ex4_denorm_frac[51:0] = {25'b0,ex4_frac[54:28]}; //-1049
+  13'h1fe5:ex4_denorm_frac[51:0] = {26'b0,ex4_frac[54:29]}; //-1050
+  13'h1fe4:ex4_denorm_frac[51:0] = {27'b0,ex4_frac[54:30]}; //-1051
+  13'h1fe3:ex4_denorm_frac[51:0] = {28'b0,ex4_frac[54:31]}; //-1052
+  13'h1fe2:ex4_denorm_frac[51:0] = {29'b0,ex4_frac[54:32]}; //-1053
+  13'h1fe1:ex4_denorm_frac[51:0] = {30'b0,ex4_frac[54:33]}; //-1054
+  13'h1fe0:ex4_denorm_frac[51:0] = {31'b0,ex4_frac[54:34]}; //-1055
+  13'h1fdf:ex4_denorm_frac[51:0] = {32'b0,ex4_frac[54:35]}; //-1056
+  13'h1fde:ex4_denorm_frac[51:0] = {33'b0,ex4_frac[54:36]}; //-1057
+  13'h1fdd:ex4_denorm_frac[51:0] = {34'b0,ex4_frac[54:37]}; //-1058
+  13'h1fdc:ex4_denorm_frac[51:0] = {35'b0,ex4_frac[54:38]}; //-1059
+  13'h1fdb:ex4_denorm_frac[51:0] = {36'b0,ex4_frac[54:39]}; //-1060
+  13'h1fda:ex4_denorm_frac[51:0] = {37'b0,ex4_frac[54:40]}; //-1061
+  13'h1fd9:ex4_denorm_frac[51:0] = {38'b0,ex4_frac[54:41]}; //-1062
+  13'h1fd8:ex4_denorm_frac[51:0] = {39'b0,ex4_frac[54:42]}; //-1063
+  13'h1fd7:ex4_denorm_frac[51:0] = {40'b0,ex4_frac[54:43]}; //-1064
+  13'h1fd6:ex4_denorm_frac[51:0] = {41'b0,ex4_frac[54:44]}; //-1065
+  13'h1fd5:ex4_denorm_frac[51:0] = {42'b0,ex4_frac[54:45]};  //-1066
+  13'h1fd4:ex4_denorm_frac[51:0] = {43'b0,ex4_frac[54:46]};  //-1067
+  13'h1fd3:ex4_denorm_frac[51:0] = {44'b0,ex4_frac[54:47]};  //-1068
+  13'h1fd2:ex4_denorm_frac[51:0] = {45'b0,ex4_frac[54:48]};  //-1069
+  13'h1fd1:ex4_denorm_frac[51:0] = {46'b0,ex4_frac[54:49]};  //-1070
+  13'h1fd0:ex4_denorm_frac[51:0] = {47'b0,ex4_frac[54:50]};  //-1071
+  13'h1fcf:ex4_denorm_frac[51:0] = {48'b0,ex4_frac[54:51]};  //-1072
+  13'h1fce:ex4_denorm_frac[51:0] = {49'b0,ex4_frac[54:52]};  //-1073
+  13'h1fcd:ex4_denorm_frac[51:0] = {50'b0,ex4_frac[54:53]};  //-1074
+  default: ex4_denorm_frac[51:0] = vfdsu_ex4_denorm_to_tiny_frac ? 52'b1 : 52'b0; //any other exponent: only the LSB (tiny fraction) or zero
+endcase                                                                  
+// &CombEnd;    @102
+end
+// &CombBeg; @103
+always @( vfdsu_ex4_expnt_rst[12:0]
+       or ex4_frac[54:1]
+       or vfdsu_ex4_denorm_to_tiny_frac)
+begin
+case(vfdsu_ex4_expnt_rst[12:0])
+  13'h1:   ex4_single_denorm_frac[51:0] = {      ex4_frac[52:1]}; //-1022 1 (labels in this table repeat the double-precision ones)
+  13'h0:   ex4_single_denorm_frac[51:0] = {      ex4_frac[53:2]}; //-1023 0
+  13'h1fff:ex4_single_denorm_frac[51:0] = {      ex4_frac[54:3]}; //-1024 -1
+  13'h1ffe:ex4_single_denorm_frac[51:0] = {1'b0, ex4_frac[54:4]}; //-1025 -2
+  13'h1ffd:ex4_single_denorm_frac[51:0] = {2'b0, ex4_frac[54:5]}; //-1026 -3
+  13'h1ffc:ex4_single_denorm_frac[51:0] = {3'b0, ex4_frac[54:6]}; //-1027 -4
+  13'h1ffb:ex4_single_denorm_frac[51:0] = {4'b0, ex4_frac[54:7]}; //-1028 -5
+  13'h1ffa:ex4_single_denorm_frac[51:0] = {5'b0, ex4_frac[54:8]}; //-1029 -6
+  13'h1ff9:ex4_single_denorm_frac[51:0] = {6'b0, ex4_frac[54:9]}; //-1030 -7
+  13'h1ff8:ex4_single_denorm_frac[51:0] = {7'b0, ex4_frac[54:10]}; //-1031 -8
+  13'h1ff7:ex4_single_denorm_frac[51:0] = {8'b0, ex4_frac[54:11]}; //-1032 -9
+  13'h1ff6:ex4_single_denorm_frac[51:0] = {9'b0, ex4_frac[54:12]}; //-1033 -10
+  13'h1ff5:ex4_single_denorm_frac[51:0] = {10'b0,ex4_frac[54:13]}; //-1034 -11
+  13'h1ff4:ex4_single_denorm_frac[51:0] = {11'b0,ex4_frac[54:14]}; //-1035 -12
+  13'h1ff3:ex4_single_denorm_frac[51:0] = {12'b0,ex4_frac[54:15]}; //-1036 -13  
+  13'h1ff2:ex4_single_denorm_frac[51:0] = {13'b0,ex4_frac[54:16]}; // -1037
+  13'h1ff1:ex4_single_denorm_frac[51:0] = {14'b0,ex4_frac[54:17]}; //-1038
+  13'h1ff0:ex4_single_denorm_frac[51:0] = {15'b0,ex4_frac[54:18]}; //-1039
+  13'h1fef:ex4_single_denorm_frac[51:0] = {16'b0,ex4_frac[54:19]}; //-1040
+  13'h1fee:ex4_single_denorm_frac[51:0] = {17'b0,ex4_frac[54:20]}; //-1041
+  13'h1fed:ex4_single_denorm_frac[51:0] = {18'b0,ex4_frac[54:21]}; //-1042
+  13'h1fec:ex4_single_denorm_frac[51:0] = {19'b0,ex4_frac[54:22]}; //-1043
+  13'h1feb:ex4_single_denorm_frac[51:0] = {20'b0,ex4_frac[54:23]}; //-1044
+  13'h1fea:ex4_single_denorm_frac[51:0] = {21'b0,ex4_frac[54:24]}; //-1045
+  default :ex4_single_denorm_frac[51:0] = vfdsu_ex4_denorm_to_tiny_frac ?{23'b1,29'b0} : 52'b0; //otherwise: LSB of the packed [51:29] slice (tiny fraction) or zero
+endcase                                                                  
+// &CombEnd; @131
+end
+// &CombBeg; @132
+always @( vfdsu_ex4_expnt_rst[12:0]
+       or ex4_frac[54:1]
+       or vfdsu_ex4_denorm_to_tiny_frac)
+begin
+case(vfdsu_ex4_expnt_rst[12:0])
+  13'h1:   ex4_half_denorm_frac[51:0] = {      ex4_frac[52:1]}; //-1022 1 (labels in this table repeat the double-precision ones)
+  13'h0:   ex4_half_denorm_frac[51:0] = {      ex4_frac[53:2]}; //-1023 0
+  13'h1fff:ex4_half_denorm_frac[51:0] = {      ex4_frac[54:3]}; //-1024 -1
+  13'h1ffe:ex4_half_denorm_frac[51:0] = {1'b0, ex4_frac[54:4]}; //-1025 -2
+  13'h1ffd:ex4_half_denorm_frac[51:0] = {2'b0, ex4_frac[54:5]}; //-1026 -3
+  13'h1ffc:ex4_half_denorm_frac[51:0] = {3'b0, ex4_frac[54:6]}; //-1027 -4
+  13'h1ffb:ex4_half_denorm_frac[51:0] = {4'b0, ex4_frac[54:7]}; //-1028 -5
+  13'h1ffa:ex4_half_denorm_frac[51:0] = {5'b0, ex4_frac[54:8]}; //-1029 -6
+  13'h1ff9:ex4_half_denorm_frac[51:0] = {6'b0, ex4_frac[54:9]}; //-1030 -7
+  13'h1ff8:ex4_half_denorm_frac[51:0] = {7'b0, ex4_frac[54:10]}; //-1031 -8
+  13'h1ff7:ex4_half_denorm_frac[51:0] = {8'b0, ex4_frac[54:11]}; //-1032 -9
+  default :ex4_half_denorm_frac[51:0] = vfdsu_ex4_denorm_to_tiny_frac ?{10'b1,42'b0} : 52'b0; //otherwise: LSB of the packed [51:42] slice (tiny fraction) or zero
+endcase                                                                  
+// &CombEnd; @147
+end
+
+//when rounding adds 1 to a denormal number, it can become a normal number
+assign ex4_denorm_potnt_norm    = (vfdsu_ex4_potnt_norm[1] && ex4_frac[53]) || 
+                                  (vfdsu_ex4_potnt_norm[0] && ex4_frac[54]) ;
+assign ex4_rslt_denorm          = !vfdsu_ex4_result_qnan 
+                                  && !vfdsu_ex4_result_zero 
+                                  && (vfdsu_ex4_rslt_denorm && !ex4_denorm_potnt_norm);
+assign ex4_denorm_result[63:0]  = vfdsu_ex4_double ? 
+                                  {vfdsu_ex4_result_sign,11'h0,ex4_denorm_frac[51:0]} :
+                                  vfdsu_ex4_single ? {32'hffffffff,vfdsu_ex4_result_sign,
+                                        8'h0,ex4_single_denorm_frac[51:29]}  : {
+                                        48'hffffffffffff,vfdsu_ex4_result_sign,5'h0,
+                                        ex4_half_denorm_frac[51:42]};
+
+                               
+// half-precision special results; bits [63:16] are filled with ones
+assign ex4_half_lfn[63:0]      = {48'hffffffffffff,vfdsu_ex4_result_sign,5'h1e,{10{1'b1}}};
+assign ex4_half_rst_qnan[63:0] = {48'hffffffffffff,vfdsu_ex4_qnan_sign, 5'h1f,1'b1, vfdsu_ex4_qnan_f[8:0]};
+assign ex4_half_rst_inf[63:0]  = {48'hffffffffffff,vfdsu_ex4_result_sign,5'h1f,10'b0};
+assign ex4_half_rst_norm[63:0] = {48'hffffffffffff,vfdsu_ex4_result_sign,
+                                  ex4_expnt_rst[4:0],
+                                  ex4_frac_52[51:42]};
+assign ex4_half_rst0[63:0] = {48'hffffffffffff,vfdsu_ex4_result_sign,15'h0};                                
+//ex4 overflow/underflow plus                                 
+assign ex4_rst_nor = vfdsu_ex4_result_nor;                    
+assign ex4_of_plus = vfdsu_ex4_potnt_of  && 
+                     (|ex4_frac[54:53])  && 
+                     ex4_rst_nor;
+assign ex4_uf_plus = vfdsu_ex4_potnt_uf  && 
+                     (~|ex4_frac[54:53]) && 
+                     ex4_rst_nor;
+//ex4 overflow round result
+assign ex4_result_lfn = (ex4_of_plus &&  vfdsu_ex4_of_rst_lfn) ||
+                        vfdsu_ex4_result_lfn;
+assign ex4_result_inf = (ex4_of_plus && !vfdsu_ex4_of_rst_lfn) ||
+                        vfdsu_ex4_result_inf;
+//Special Result Form
+// result largest finite number
+assign ex4_doub_lfn[63:0]      = {vfdsu_ex4_result_sign,11'h7fe,{52{1'b1}}};
+assign ex4_sing_lfn[63:0]      = {32'hffffffff,vfdsu_ex4_result_sign,8'hfe,{23{1'b1}}};
+// result 0
+assign ex4_doub_rst0[63:0]     = {vfdsu_ex4_result_sign,63'b0};
+assign ex4_sing_rst0[63:0]     = {32'hffffffff,vfdsu_ex4_result_sign,31'b0};
+//result qNaN
+// &Force("bus","vfdsu_ex4_qnan_f",51,0); @192
+assign ex4_doub_rst_qnan[63:0] = {      vfdsu_ex4_qnan_sign, 11'h7ff, 1'b1, vfdsu_ex4_qnan_f[50:0]};
+assign ex4_sing_rst_qnan[63:0] = {32'hffffffff,vfdsu_ex4_qnan_sign, 8'hff,   1'b1, vfdsu_ex4_qnan_f[21:0]};
+//result infinity
+assign ex4_doub_rst_inf[63:0]  = {vfdsu_ex4_result_sign,11'h7ff,52'b0};
+assign ex4_sing_rst_inf[63:0]  = {32'hffffffff,vfdsu_ex4_result_sign,8'hff,23'b0};
+//result normal
+// &CombBeg; @199
+always @( ex4_frac[54:0])
+begin
+casez(ex4_frac[54:53])
+  2'b00   : ex4_frac_52[51:0]  = ex4_frac[51:0];
+  2'b01   : ex4_frac_52[51:0]  = ex4_frac[52:1];
+  2'b1?   : ex4_frac_52[51:0]  = ex4_frac[53:2];
+  default : ex4_frac_52[51:0]  = 52'b0;
+endcase
+// &CombEnd; @206
+end
+assign ex4_doub_rst_norm[63:0] = {vfdsu_ex4_result_sign,
+                                  ex4_expnt_rst[10:0],
+                                  ex4_frac_52[51:0]};
+assign ex4_sing_rst_norm[63:0] = {32'hffffffff,vfdsu_ex4_result_sign,
+                                  ex4_expnt_rst[7:0],
+                                  ex4_frac_52[51:29]};
+assign ex4_rst_lfn[63:0]       = (vfdsu_ex4_double) ? ex4_doub_lfn[63:0] :
+                                  vfdsu_ex4_single  ? ex4_sing_lfn[63:0] : ex4_half_lfn[63:0];
+
+assign ex4_rst0[63:0]          = (vfdsu_ex4_double) ? ex4_doub_rst0[63:0] :
+                                  vfdsu_ex4_single  ? ex4_sing_rst0[63:0] : ex4_half_rst0[63:0];
+
+assign ex4_rst_qnan[63:0]      = (vfdsu_ex4_double) ? ex4_doub_rst_qnan[63:0] :
+                                  vfdsu_ex4_single  ? ex4_sing_rst_qnan[63:0] 
+                                                    : ex4_half_rst_qnan[63:0];
+
+assign ex4_rst_norm[63:0]      = (vfdsu_ex4_double) ? ex4_doub_rst_norm[63:0] :
+                                  vfdsu_ex4_single  ? ex4_sing_rst_norm[63:0]
+                                                    : ex4_half_rst_norm[63:0];
+assign ex4_rst_inf[63:0]       = (vfdsu_ex4_double) ? ex4_doub_rst_inf[63:0] :
+                                  vfdsu_ex4_single  ? ex4_sing_rst_inf[63:0]
+                                                    : ex4_half_rst_inf[63:0];
+// corrected underflow and inexact flags
+      
+assign ex4_cor_uf            = (vfdsu_ex4_uf && !ex4_denorm_potnt_norm || ex4_uf_plus)
+                               && vfdsu_ex4_nx;
+assign ex4_cor_nx            =  vfdsu_ex4_nx 
+                                || vfdsu_ex4_of 
+                                || ex4_of_plus;
+// exception flags, MSB to LSB: nv, dz, of, uf, nx
+assign ex4_out_expt[4:0]           = {
+                                  vfdsu_ex4_nv,
+                                  vfdsu_ex4_dz,
+                                  vfdsu_ex4_of | ex4_of_plus,
+                                  ex4_cor_uf,
+                                  ex4_cor_nx};
+// a normal result is selected only when no special case applies
+assign ex4_final_rst_norm      = !vfdsu_ex4_result_qnan && 
+                                 !ex4_result_inf        &&
+                                 !ex4_result_lfn        &&
+                                 !vfdsu_ex4_result_zero &&
+                                 !ex4_rslt_denorm; 
+// &CombBeg; @249
+always @( ex4_rst_norm[63:0]
+       or ex4_result_lfn
+       or vfdsu_ex4_result_qnan
+       or ex4_rst_qnan[63:0]
+       or ex4_rst0[63:0]
+       or ex4_rslt_denorm
+       or ex4_denorm_result[63:0]
+       or ex4_result_inf
+       or ex4_final_rst_norm
+       or ex4_rst_lfn[63:0]
+       or vfdsu_ex4_result_zero
+       or ex4_rst_inf[63:0])
+begin
+case({ex4_rslt_denorm,
+      vfdsu_ex4_result_qnan,
+      ex4_result_inf,
+      ex4_result_lfn,
+      vfdsu_ex4_result_zero,
+      ex4_final_rst_norm})
+  6'b100000 : ex4_out_result[63:0]  = ex4_denorm_result[63:0];
+  6'b010000 : ex4_out_result[63:0]  = ex4_rst_qnan[63:0]; 
+  6'b001000 : ex4_out_result[63:0]  = ex4_rst_inf[63:0];
+  6'b000100 : ex4_out_result[63:0]  = ex4_rst_lfn[63:0];
+  6'b000010 : ex4_out_result[63:0]  = ex4_rst0[63:0];
+  6'b000001 : ex4_out_result[63:0]  = ex4_rst_norm[63:0];
+  default   : ex4_out_result[63:0]  = 64'b0; // no selector or more than one selector set
+endcase
+// &CombEnd; @264
+end
+
+// &ModuleEnd; @266
+endmodule
+
+
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v
new file mode 100644
index 00000000..7c5821c8
--- /dev/null
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v
@@ -0,0 +1,773 @@
+/*Copyright 2019-2021 T-Head Semiconductor Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// &ModuleBeg; @22
+module ct_vfdsu_prepare(
+  cp0_vfpu_icg_en,
+  cp0_yy_clk_en,
+  cpurst_b,
+  ex1_div,
+  ex1_divisor,
+  ex1_double,
+  ex1_pipedown,
+  ex1_remainder,
+  ex1_scalar,
+  ex1_single,
+  ex1_sqrt,
+  ex1_src0,
+  ex1_src1,
+  ex1_static_rm,
+  forever_cpuclk,
+  pad_yy_icg_scan_en,
+  vfdsu_ex2_div,
+  vfdsu_ex2_double,
+  vfdsu_ex2_dz,
+  vfdsu_ex2_expnt_add0,
+  vfdsu_ex2_expnt_add1,
+  vfdsu_ex2_nv,
+  vfdsu_ex2_of_rm_lfn,
+  vfdsu_ex2_op0_norm,
+  vfdsu_ex2_op1_norm,
+  vfdsu_ex2_qnan_f,
+  vfdsu_ex2_qnan_sign,
+  vfdsu_ex2_result_inf,
+  vfdsu_ex2_result_qnan,
+  vfdsu_ex2_result_sign,
+  vfdsu_ex2_result_zero,
+  vfdsu_ex2_rm,
+  vfdsu_ex2_single,
+  vfdsu_ex2_sqrt,
+  vfdsu_ex2_srt_skip,
+  vfpu_yy_xx_dqnan,
+  vfpu_yy_xx_rm
+);
+
+// &Ports; @23
+input           cp0_vfpu_icg_en;          
+input           cp0_yy_clk_en;            
+input           cpurst_b;                 
+input           ex1_div;                  
+input           ex1_double;               
+input           ex1_pipedown;             
+input           ex1_scalar;               
+input           ex1_single;               
+input           ex1_sqrt;                 
+input   [63:0]  ex1_src0;                 
+input   [63:0]  ex1_src1;                 
+input   [2 :0]  ex1_static_rm;            
+input           forever_cpuclk;           
+input           pad_yy_icg_scan_en;       
+input           vfpu_yy_xx_dqnan;         
+input   [2 :0]  vfpu_yy_xx_rm;            
+output  [52:0]  ex1_divisor;              
+output  [59:0]  ex1_remainder;            
+output          vfdsu_ex2_div;            
+output          vfdsu_ex2_double;         
+output          vfdsu_ex2_dz;             
+output  [12:0]  vfdsu_ex2_expnt_add0;     
+output  [12:0]  vfdsu_ex2_expnt_add1;     
+output          vfdsu_ex2_nv;             
+output          vfdsu_ex2_of_rm_lfn;      
+output          vfdsu_ex2_op0_norm;       
+output          vfdsu_ex2_op1_norm;       
+output  [51:0]  vfdsu_ex2_qnan_f;         
+output          vfdsu_ex2_qnan_sign;      
+output          vfdsu_ex2_result_inf;     
+output          vfdsu_ex2_result_qnan;    
+output          vfdsu_ex2_result_sign;    
+output          vfdsu_ex2_result_zero;    
+output  [2 :0]  vfdsu_ex2_rm;             
+output          vfdsu_ex2_single;         
+output          vfdsu_ex2_sqrt;           
+output          vfdsu_ex2_srt_skip;       
+
+// &Regs; @24
+reg     [12:0]  ex1_expnt_adder_op1;      
+reg             ex1_of_result_lfn;        
+reg     [51:0]  ex1_qnan_f;               
+reg             ex1_qnan_sign;            
+reg             vfdsu_ex2_div;            
+reg             vfdsu_ex2_double;         
+reg             vfdsu_ex2_dz;             
+reg     [12:0]  vfdsu_ex2_expnt_add0;     
+reg     [12:0]  vfdsu_ex2_expnt_add1;     
+reg             vfdsu_ex2_nv;             
+reg             vfdsu_ex2_of_rm_lfn;      
+reg             vfdsu_ex2_op0_norm;       
+reg             vfdsu_ex2_op1_norm;       
+reg     [51:0]  vfdsu_ex2_qnan_f;         
+reg             vfdsu_ex2_qnan_sign;      
+reg             vfdsu_ex2_result_inf;     
+reg             vfdsu_ex2_result_qnan;    
+reg             vfdsu_ex2_result_sign;    
+reg             vfdsu_ex2_result_zero;    
+reg     [2 :0]  vfdsu_ex2_rm;             
+reg             vfdsu_ex2_single;         
+reg             vfdsu_ex2_sqrt;           
+reg             vfdsu_ex2_srt_skip;       
+
+// &Wires; @25
+wire            cp0_vfpu_icg_en;          
+wire            cp0_yy_clk_en;            
+wire            cpurst_b;                 
+wire            div_sign;                 
+wire            ex1_div;                  
+wire            ex1_div_dz;               
+wire    [52:0]  ex1_div_noid_nor_srt_op0; 
+wire    [52:0]  ex1_div_noid_nor_srt_op1; 
+wire    [52:0]  ex1_div_nor_srt_op0;      
+wire    [52:0]  ex1_div_nor_srt_op1;      
+wire            ex1_div_nv;               
+wire    [12:0]  ex1_div_op0_expnt;        
+wire    [12:0]  ex1_div_op1_expnt;        
+wire            ex1_div_rst_inf;          
+wire            ex1_div_rst_qnan;         
+wire            ex1_div_rst_zero;         
+wire    [52:0]  ex1_div_srt_op0;          
+wire    [52:0]  ex1_div_srt_op1;          
+wire    [52:0]  ex1_divisor;              
+wire            ex1_doub_expnt0_max;      
+wire            ex1_doub_expnt0_zero;     
+wire            ex1_doub_expnt1_max;      
+wire            ex1_doub_expnt1_zero;     
+wire            ex1_doub_frac0_all0;      
+wire            ex1_doub_frac1_all0;      
+wire            ex1_double;               
+wire            ex1_dz;                   
+wire            ex1_expnt0_max;           
+wire            ex1_expnt0_zero;          
+wire            ex1_expnt1_max;           
+wire            ex1_expnt1_zero;          
+wire    [12:0]  ex1_expnt_adder_op0;      
+wire            ex1_frac0_all0;           
+wire            ex1_frac0_msb;            
+wire            ex1_frac1_all0;           
+wire            ex1_frac1_msb;            
+wire            ex1_half_expnt0_max;      
+wire            ex1_half_expnt0_zero;     
+wire            ex1_half_expnt1_max;      
+wire            ex1_half_expnt1_zero;     
+wire            ex1_half_frac0_all0;      
+wire            ex1_half_frac1_all0;      
+wire            ex1_nv;                   
+wire            ex1_op0_cnan;             
+wire    [51:0]  ex1_op0_f;                
+wire            ex1_op0_id;               
+wire            ex1_op0_id_nor;           
+wire            ex1_op0_inf;              
+wire            ex1_op0_is_qnan;          
+wire            ex1_op0_is_snan;          
+wire            ex1_op0_norm;             
+wire            ex1_op0_qnan;             
+wire            ex1_op0_sign;             
+wire            ex1_op0_snan;             
+wire            ex1_op0_tt_zero;          
+wire            ex1_op0_zero;             
+wire            ex1_op1_cnan;             
+wire    [51:0]  ex1_op1_f;                
+wire            ex1_op1_id;               
+wire            ex1_op1_id_nor;           
+wire            ex1_op1_inf;              
+wire            ex1_op1_is_qnan;          
+wire            ex1_op1_is_snan;          
+wire            ex1_op1_norm;             
+wire            ex1_op1_qnan;             
+wire            ex1_op1_sign;             
+wire            ex1_op1_snan;             
+wire            ex1_op1_tt_zero;          
+wire            ex1_op1_zero;             
+wire    [63:0]  ex1_oper0;                
+wire    [51:0]  ex1_oper0_frac;           
+wire            ex1_oper0_high_all1;      
+wire    [12:0]  ex1_oper0_id_expnt;       
+wire    [51:0]  ex1_oper0_id_frac;        
+wire    [63:0]  ex1_oper1;                
+wire    [51:0]  ex1_oper1_frac;           
+wire            ex1_oper1_high_all1;      
+wire    [12:0]  ex1_oper1_id_expnt;       
+wire    [51:0]  ex1_oper1_id_frac;        
+wire            ex1_pipe_clk;             
+wire            ex1_pipe_clk_en;          
+wire            ex1_pipedown;             
+wire    [59:0]  ex1_remainder;            
+wire            ex1_result_inf;           
+wire            ex1_result_qnan;          
+wire            ex1_result_sign;          
+wire            ex1_result_zero;          
+wire    [2 :0]  ex1_rm;                   
+wire            ex1_rst_default_qnan;     
+wire            ex1_scalar;               
+wire            ex1_sing_expnt0_max;      
+wire            ex1_sing_expnt0_zero;     
+wire            ex1_sing_expnt1_max;      
+wire            ex1_sing_expnt1_zero;     
+wire            ex1_sing_frac0_all0;      
+wire            ex1_sing_frac1_all0;      
+wire            ex1_single;               
+wire            ex1_sqrt;                 
+wire            ex1_sqrt_expnt_odd;       
+wire            ex1_sqrt_expnt_result_odd; 
+wire            ex1_sqrt_nv;              
+wire    [12:0]  ex1_sqrt_op1_expnt;       
+wire            ex1_sqrt_rst_inf;         
+wire            ex1_sqrt_rst_qnan;        
+wire            ex1_sqrt_rst_zero;        
+wire    [52:0]  ex1_sqrt_srt_op0;         
+wire    [63:0]  ex1_src0;                 
+wire    [63:0]  ex1_src1;                 
+wire            ex1_srt_skip;             
+wire    [2 :0]  ex1_static_rm;            
+wire            forever_cpuclk;           
+wire            pad_yy_icg_scan_en;       
+wire    [59:0]  sqrt_remainder;           
+wire            sqrt_sign;                
+wire            vfpu_yy_xx_dqnan;         
+wire    [2 :0]  vfpu_yy_xx_rm;            
+
+
+//======================Operator prepare====================
+//VECTOR_SIMD
+
+assign ex1_oper0[63:0]             = ex1_src0[63:0];
+assign ex1_oper1[63:0]             = ex1_src1[63:0];
+
+
+//Sign bit prepare
+assign ex1_op0_sign                =  ex1_double ? ex1_oper0[63] :
+                                      ex1_single ? ex1_oper0[31] : ex1_oper0[15]; 
+assign ex1_op1_sign                =  ex1_double ? ex1_oper1[63] :
+                                      ex1_single ? ex1_oper1[31] : ex1_oper1[15]; 
+assign div_sign                    = ex1_op0_sign ^ ex1_op1_sign;
+assign sqrt_sign                   = ex1_op0_sign;
+assign ex1_result_sign             = (ex1_div)
+                                   ? div_sign 
+                                   : sqrt_sign;
+//exponent max
+assign ex1_doub_expnt0_max         = &ex1_oper0[62:52];
+assign ex1_sing_expnt0_max         = &ex1_oper0[30:23];
+assign ex1_doub_expnt1_max         = &ex1_oper1[62:52];
+assign ex1_sing_expnt1_max         = &ex1_oper1[30:23];
+assign ex1_half_expnt0_max         = &ex1_oper0[14:10];
+assign ex1_half_expnt1_max         = &ex1_oper1[14:10];
+assign ex1_expnt0_max              = ex1_double ? ex1_doub_expnt0_max :
+                                     ex1_single ? ex1_sing_expnt0_max : ex1_half_expnt0_max;
+assign ex1_expnt1_max              = ex1_double ? ex1_doub_expnt1_max :
+                                     ex1_single ? ex1_sing_expnt1_max : ex1_half_expnt1_max;
+             
+//exponent zero
+assign ex1_doub_expnt0_zero        = ~|ex1_oper0[62:52];
+assign ex1_sing_expnt0_zero        = ~|ex1_oper0[30:23];
+assign ex1_doub_expnt1_zero        = ~|ex1_oper1[62:52];
+assign ex1_sing_expnt1_zero        = ~|ex1_oper1[30:23];
+assign ex1_half_expnt0_zero        = ~|ex1_oper0[14:10];
+assign ex1_half_expnt1_zero        = ~|ex1_oper1[14:10];
+assign ex1_expnt0_zero             = ex1_double ? ex1_doub_expnt0_zero :
+                                     ex1_single ? ex1_sing_expnt0_zero : ex1_half_expnt0_zero;
+assign ex1_expnt1_zero             = ex1_double ? ex1_doub_expnt1_zero :
+                                     ex1_single ? ex1_sing_expnt1_zero : ex1_half_expnt1_zero; 
+//fraction zero
+assign ex1_doub_frac0_all0         = ~|ex1_oper0[51:0];
+assign ex1_sing_frac0_all0         = ~|ex1_oper0[22:0];
+assign ex1_doub_frac1_all0         = ~|ex1_oper1[51:0];
+assign ex1_sing_frac1_all0         = ~|ex1_oper1[22:0];
+assign ex1_half_frac0_all0         = ~|ex1_oper0[9:0];
+assign ex1_half_frac1_all0         = ~|ex1_oper1[9:0];
+assign ex1_frac0_all0              = ex1_double ? ex1_doub_frac0_all0 :
+                                     ex1_single ? ex1_sing_frac0_all0 : ex1_half_frac0_all0;   
+assign ex1_frac1_all0              = ex1_double ? ex1_doub_frac1_all0 :
+                                     ex1_single ? ex1_sing_frac1_all0 : ex1_half_frac1_all0;   
+assign ex1_frac0_msb               = ex1_double ? ex1_oper0[51] :
+                                     ex1_single ? ex1_oper0[22] : ex1_oper0[9];
+assign ex1_frac1_msb               = ex1_double ? ex1_oper1[51] :
+                                     ex1_single ? ex1_oper1[22] : ex1_oper1[9]; 
+assign ex1_oper0_high_all1         = ex1_single ? &ex1_oper0[63:32] : &ex1_oper0[63:16]; 
+assign ex1_oper1_high_all1         = ex1_single ? &ex1_oper1[63:32] : &ex1_oper1[63:16];
+ 
+
+//infinity: max exponent, all-zero fraction, and not a cNaN
+assign  ex1_op0_inf                = ex1_expnt0_max && 
+                                     ex1_frac0_all0 &&
+                                    ~ex1_op0_cnan;
+assign  ex1_op1_inf                = ex1_expnt1_max && 
+                                     ex1_frac1_all0 &&
+                                    ~ex1_op1_cnan;
+//zero: all-zero exponent, all-zero fraction, and not a cNaN
+assign ex1_op0_zero                = ex1_expnt0_zero && 
+                                     ex1_frac0_all0  &&
+                                    ~ex1_op0_cnan;
+assign ex1_op1_zero                = ex1_expnt1_zero && 
+                                     ex1_frac1_all0  &&
+                                    ~ex1_op1_cnan;
+//denormal: all-zero exponent, non-zero fraction, and not a cNaN
+assign ex1_op0_id                  =  ex1_expnt0_zero && 
+                                     ~ex1_frac0_all0  &&
+                                     ~ex1_op0_cnan;
+assign ex1_op1_id                  =  ex1_expnt1_zero && 
+                                     ~ex1_frac1_all0  &&
+                                     ~ex1_op1_cnan;
+//assign ex1_op0_id_fm1              =  vfpu_yy_xx_fm[1]  &&
+//                                      vfpu_yy_xx_fm[0]  && 
+//                                      ex1_op0_id;
+//assign ex1_op1_id_fm1              =  vfpu_yy_xx_fm[1]  &&
+//                                      vfpu_yy_xx_fm[0]  && 
+//                                      ex1_op1_id;
+//assign ex1_op0_id_fm0              =  vfpu_yy_xx_fm[1]   &&
+//                                      !vfpu_yy_xx_fm[0]  && 
+//                                      ex1_op0_id;
+//assign ex1_op1_id_fm0              =  vfpu_yy_xx_fm[1]   &&
+//                                      !vfpu_yy_xx_fm[0]  && 
+//                                      ex1_op1_id;
+assign ex1_op0_id_nor              = ex1_op0_id; // plain alias: the fm-qualified variants above are commented out
+assign ex1_op1_id_nor              = ex1_op1_id; // plain alias: the fm-qualified variants above are commented out
+
+//cNaN: scalar, non-double operand whose upper bits (ex1_oper*_high_all1) are not all ones; presumably a bad NaN-box - confirm
+assign ex1_op0_cnan                =  ex1_scalar  &&
+                                      !ex1_double &&
+                                      !ex1_oper0_high_all1;
+                                      
+assign ex1_op1_cnan                =  ex1_scalar  && 
+                                      !ex1_double &&
+                                      !ex1_oper1_high_all1;
+
+//sNaN: max exponent, non-zero fraction with its MSB clear, and not a cNaN
+assign ex1_op0_snan                =  ex1_expnt0_max &&
+                                     ~ex1_frac0_all0 &&
+                                     ~ex1_frac0_msb  &&
+                                     ~ex1_op0_cnan;
+assign ex1_op1_snan                =  ex1_expnt1_max &&
+                                     ~ex1_frac1_all0 &&
+                                     ~ex1_frac1_msb  &&
+                                     ~ex1_op1_cnan;
+
+//qNaN: max exponent with the fraction MSB set, or any cNaN
+assign ex1_op0_qnan                = (ex1_expnt0_max && 
+                                      ex1_frac0_msb) ||
+                                      ex1_op0_cnan;
+assign ex1_op1_qnan                = (ex1_expnt1_max && 
+                                      ex1_frac1_msb) ||
+                                      ex1_op1_cnan;
+//=====================find first one=======================
+// ct_vfdsu_ff1 outputs replace the exponent (id_expnt) and fraction (id_frac) of a denormal operand
+// &Instance("ct_vfdsu_ff1","x_frac0_expnt"); @150
+ct_vfdsu_ff1  x_frac0_expnt (
+  .fanc_shift_num           (ex1_oper0_id_frac[51:0] ),
+  .frac_bin_val             (ex1_oper0_id_expnt[12:0]),
+  .frac_num                 (ex1_oper0_frac[51:0]    )
+);
+
+// &Connect(.frac_num(ex1_oper0_frac[51:0])); @151
+// &Connect(.frac_bin_val(ex1_oper0_id_expnt[12:0])); @152
+// &Connect(.fanc_shift_num(ex1_oper0_id_frac[51:0])); @153
+
+// &Instance("ct_vfdsu_ff1","x_frac1_expnt"); @155
+ct_vfdsu_ff1  x_frac1_expnt (
+  .fanc_shift_num           (ex1_oper1_id_frac[51:0] ),
+  .frac_bin_val             (ex1_oper1_id_expnt[12:0]),
+  .frac_num                 (ex1_oper1_frac[51:0]    )
+);
+
+// &Connect(.frac_num(ex1_oper1_frac[51:0])); @156
+// &Connect(.frac_bin_val(ex1_oper1_id_expnt[12:0])); @157
+// &Connect(.fanc_shift_num(ex1_oper1_id_frac[51:0])); @158
+assign ex1_oper0_frac[51:0] = ex1_double ? ex1_oper0[51:0] : // fraction left-aligned into 52 bits
+                                           ex1_single ? {ex1_oper0[22:0],29'b0} // 23-bit fraction, zero-padded below
+                                                      : {ex1_oper0[9:0],42'b0}; // 10-bit fraction, zero-padded below
+assign ex1_oper1_frac[51:0] = ex1_double ? ex1_oper1[51:0] : // same alignment for operand 1
+                                           ex1_single ? {ex1_oper1[22:0],29'b0}
+                                                      : {ex1_oper1[9:0],42'b0};
+//=====================exponent add=========================
+//exponent number 0: exponent field zero-extended to 13 bits; denormals use the ff1 value instead
+assign ex1_div_op0_expnt[12:0]     = ex1_double ? {2'b0,ex1_oper0[62:52]} : 
+                                                  ex1_single ? {5'b0,ex1_oper0[30:23]}
+                                                             : {8'b0,ex1_oper0[14:10]};
+assign ex1_expnt_adder_op0[12:0]   = ex1_op0_id_nor ? ex1_oper0_id_expnt[12:0]
+                                                    : ex1_div_op0_expnt[12:0];
+//exponent number 1: divisor exponent for div, a per-format constant for sqrt
+assign ex1_div_op1_expnt[12:0]  = ex1_double ? {2'b0,ex1_oper1[62:52]} :
+                                               ex1_single ? {5'b0,ex1_oper1[30:23]}
+                                                          : {8'b0,ex1_oper1[14:10]};
+assign ex1_sqrt_op1_expnt[12:0] = ex1_double ? {3'b0,{10{1'b1}}} : //'d1023 (FP64 exponent bias)
+                                               ex1_single ? {6'b0,{7{1'b1}}} //'d127 (FP32 exponent bias)
+                                                          : {9'b0,{4{1'b1}}}; //'d15 (FP16 exponent bias)
+  
+// &CombBeg;  @180
+always @( ex1_oper1_id_expnt[12:0] // combinational select of the second exponent operand
+       or ex1_div
+       or ex1_op1_id_nor
+       or ex1_sqrt_op1_expnt[12:0]
+       or ex1_sqrt
+       or ex1_div_op1_expnt[12:0])
+begin
+case({ex1_div,ex1_sqrt}) // exactly one of div/sqrt must be set, otherwise the default applies
+  2'b10:   ex1_expnt_adder_op1[12:0] = ex1_op1_id_nor ? ex1_oper1_id_expnt[12:0] // div, denormal divisor: ff1 exponent
+                                                  : ex1_div_op1_expnt[12:0]; // div, otherwise: divisor exponent field
+  2'b01:   ex1_expnt_adder_op1[12:0] = ex1_sqrt_op1_expnt[12:0]; // sqrt: per-format constant
+  default: ex1_expnt_adder_op1[12:0] = 13'b0; // neither or both set: drive zero
+endcase
+// &CombEnd; @187
+end
+//parity of (expnt0 sub expnt1): the LSB of a difference is the XOR of the operand LSBs
+assign ex1_sqrt_expnt_result_odd =  ex1_expnt_adder_op0[0] ^ ex1_expnt_adder_op1[0];
+
+
+//======================EX1 expt detect=====================
+//ex1_id_detect
+//any operand is zero
+// no input denormalize exception anymore
+//
+//ex1_nv_detect
+//div_nv
+//  1.any operand is sNaN
+//  2.0/0(include DN flush to zero)
+//  3.inf/inf
+//sqrt_nv
+//  1.operand is sNaN
+//  2.operand sign is 1 && operand is not zero/qNaN
+assign ex1_nv      = ex1_div  && ex1_div_nv  || // invalid flag, qualified by the active operation
+                     ex1_sqrt && ex1_sqrt_nv;
+//ex1_div_nv: sNaN operand, 0/0, or inf/inf
+assign ex1_div_nv  = ex1_op0_snan || 
+                     ex1_op1_snan || 
+                    (ex1_op0_tt_zero && ex1_op1_tt_zero)|| 
+                    (ex1_op0_inf && ex1_op1_inf);
+assign ex1_op0_tt_zero = ex1_op0_zero; // plain alias: denormals are not treated as zero here
+assign ex1_op1_tt_zero = ex1_op1_zero; // plain alias: denormals are not treated as zero here
+//ex1_sqrt_nv: sNaN operand, or a negative normal/denormal/inf operand (&& binds tighter than ||)
+assign ex1_sqrt_nv = ex1_op0_snan || 
+                     ex1_op0_sign && 
+                    (ex1_op0_norm || 
+                     ex1_op0_inf );
+assign ex1_op0_norm = !ex1_expnt0_zero && !ex1_expnt0_max && !ex1_op0_cnan || ex1_op0_id_nor ; // normal or denormal
+assign ex1_op1_norm = !ex1_expnt1_zero && !ex1_expnt1_max && !ex1_op1_cnan || ex1_op1_id_nor; // normal or denormal
+
+//ex1_of_detect
+//div_of
+//  1.only detect id overflow case
+//assign ex1_of      = ex1_div && ex1_div_of;
+//assign ex1_div_of  = ex1_op1_id_fm1 && 
+//                     ex1_op0_norm && 
+//                     ex1_div_id_of;
+//
+////ex1_uf_detect
+////div_uf
+////  1.only detect id underflow case
+//assign ex1_uf      = ex1_div && ex1_div_uf;
+//assign ex1_div_uf  = ex1_op0_id && 
+//                     ex1_op1_norm && 
+//                     ex1_div_id_uf;
+//ex1_dz_detect
+//div_dz
+//  1.op0 is normal or denormal (ex1_op0_norm) && op1 is zero
+assign ex1_dz      = ex1_div && ex1_div_dz; // only a divide can raise divide-by-zero
+assign ex1_div_dz  = ex1_op1_tt_zero && ex1_op0_norm;
+
+//===================sqrt exponent prepare==================
+//sqrt exponent prepare
+//after E sub, div E by 2
+//assign ex1_sqrt_expnt_result[12:0] = {ex1_expnt_result[12],
+//                                      ex1_expnt_result[12:1]};
+//ex1_sqrt_expnt_odd
+//when set, sqrt_remainder places the operand one bit further left
+assign ex1_sqrt_expnt_odd          = ex1_sqrt_expnt_result_odd;
+
+//===================special cal result=====================
+//ex1 result is zero
+//div_zero
+//  1.op0 is zero && op1 is normal or denormal (ex1_op1_norm)
+//  2.op0 has a non-max exponent and is not cNaN (zero/denormal/normal) && op1 is inf
+//sqrt_zero
+//  1.op0 is zero
+assign ex1_result_zero   = ex1_div_rst_zero  && ex1_div  || 
+                           ex1_sqrt_rst_zero && ex1_sqrt;
+assign ex1_div_rst_zero  = (ex1_op0_tt_zero && ex1_op1_norm ) || 
+                           (!ex1_expnt0_max && !ex1_op0_cnan && ex1_op1_inf);
+assign ex1_sqrt_rst_zero = ex1_op0_tt_zero;
+
+//ex1 result is qNaN
+//ex1_nv: any invalid operation also produces a qNaN result
+//div_qnan
+//  1.op0 is qnan || op1 is qnan
+//sqrt_qnan
+//  1.op0 is qnan
+assign ex1_result_qnan   = ex1_div_rst_qnan  && ex1_div  || 
+                           ex1_sqrt_rst_qnan && ex1_sqrt || 
+                           ex1_nv;
+assign ex1_div_rst_qnan  = ex1_op0_qnan || // ex1_op*_qnan already includes the cNaN case
+                           ex1_op1_qnan;
+assign ex1_sqrt_rst_qnan = ex1_op0_qnan;
+
+//ex1_rst_default_qnan
+//0/0, inf/inf, sqrt of a negative normal/denormal/inf should get default qNaN
+assign ex1_rst_default_qnan = (ex1_div && ex1_op0_zero && ex1_op1_zero) || 
+                              (ex1_div && ex1_op0_inf  && ex1_op1_inf)  || 
+                              (ex1_sqrt&& ex1_op0_sign && (ex1_op0_norm || ex1_op0_inf));
+
+//ex1 result is inf
+//ex1_dz: a divide-by-zero also produces an inf result
+//
+//div_inf
+//  1.op0 is inf && op1 has a non-max exponent and is not cNaN (normal/denormal/zero)
+//sqrt_inf
+//  1.op0 is inf with sign 0 (+inf)
+assign ex1_result_inf    = ex1_div_rst_inf  && ex1_div  || 
+                           ex1_sqrt_rst_inf && ex1_sqrt || 
+                           ex1_dz ;
+assign ex1_div_rst_inf   = ex1_op0_inf && !ex1_expnt1_max && !ex1_op1_cnan;
+assign ex1_sqrt_rst_inf  = ex1_op0_inf && !ex1_op0_sign;
+
+//ex1 result is lfn
+//ex1_of && round result toward not inc 1
+
+assign ex1_rm[2:0]       = ((ex1_static_rm[2:0] == 3'b111)|| !ex1_scalar) // presumably 3'b111 = dynamic rounding (RISC-V) - confirm
+                         ? vfpu_yy_xx_rm[2:0] // static rm is 3'b111, or the op is not scalar
+                         : ex1_static_rm[2:0]; // scalar op with an explicit static rounding mode
+//RNE : Always inc 1 because round to nearest of 1.111...11
+//RTZ : Always not inc 1
+//RDN : Always not inc 1 when positive
+//RUP : Always not inc 1 when negative
+//RMM : Always inc 1 because round to max magnitude
+// &CombBeg; @308
+always @( ex1_rm[2:0] // combinational: 1 selects lfn, presumably "largest finite number" on overflow - confirm
+       or ex1_result_sign)
+begin
+case(ex1_rm[2:0]) // mode names below assume the RISC-V frm encoding - confirm
+  3'b000  : ex1_of_result_lfn = 1'b0; // presumably RNE: never lfn
+  3'b001  : ex1_of_result_lfn = 1'b1; // presumably RTZ: always lfn
+  3'b010  : ex1_of_result_lfn = !ex1_result_sign; // presumably RDN: lfn only for positive results
+  3'b011  : ex1_of_result_lfn = ex1_result_sign; // presumably RUP: lfn only for negative results
+  3'b100  : ex1_of_result_lfn = 1'b0; // presumably RMM: never lfn
+  default: ex1_of_result_lfn = 1'b0; // remaining encodings: never lfn
+endcase
+// &CombEnd; @317
+end
+
+//EX1 Remainder (60 bits): div operand or sqrt operand, selected by ex1_div / ex1_sqrt
+//div  : 1/8  <= x < 1/4
+//sqrt : 1/16 <= x < 1/4
+assign ex1_remainder[59:0] = {60{ex1_div }} & {5'b0,ex1_div_srt_op0[52:0],2'b0} | 
+                             {60{ex1_sqrt}} & sqrt_remainder[59:0];
+
+//EX1 Divisor
+//1/2 <= y < 1
+assign ex1_divisor[52:0]   = ex1_div_srt_op1[52:0];
+
+//ex1_div_srt_op0
+assign ex1_div_srt_op0[52:0]     = ex1_div_nor_srt_op0[52:0];
+//ex1_div_srt_op1
+assign ex1_div_srt_op1[52:0]     =  ex1_div_nor_srt_op1[52:0];
+//ex1_div_noid_nor_srt_op*: explicit leading 1 followed by the fraction, left-aligned to 53 bits
+assign ex1_div_noid_nor_srt_op0[52:0] = ex1_double ? {1'b1,ex1_oper0[51:0]} :
+                                                     ex1_single ? {1'b1,ex1_oper0[22:0],29'b0}
+                                                                : {1'b1,ex1_oper0[9:0],42'b0};
+assign ex1_div_noid_nor_srt_op1[52:0] = ex1_double ? {1'b1,ex1_oper1[51:0]} :
+                                                     ex1_single ? {1'b1,ex1_oper1[22:0],29'b0}
+                                                                : {1'b1,ex1_oper1[9:0],42'b0};
+assign ex1_div_nor_srt_op0[52:0] = ex1_op0_id_nor ? {ex1_oper0_id_frac[51:0],1'b0} // denormal: ff1-shifted fraction plus a zero LSB
+                                                  : ex1_div_noid_nor_srt_op0[52:0];
+//ex1_div_nor_srt_op1
+assign ex1_div_nor_srt_op1[52:0] = ex1_op1_id_nor ? {ex1_oper1_id_frac[51:0],1'b0} // denormal: ff1-shifted fraction plus a zero LSB
+                                                  : ex1_div_noid_nor_srt_op1[52:0];
+//sqrt_remainder: operand sits one bit further left when ex1_sqrt_expnt_odd is set
+assign sqrt_remainder[59:0]      = (ex1_sqrt_expnt_odd)
+                                 ? {5'b0,ex1_sqrt_srt_op0[52:0],2'b0}
+                                 : {6'b0,ex1_sqrt_srt_op0[52:0],1'b0};
+//ex1_sqrt_srt_op0: sqrt reuses the operand-0 significand prepared for div
+assign ex1_sqrt_srt_op0[52:0]    = ex1_div_srt_op0[52:0];
+
+//Default_qnan/Standard_qnan Select: operand 1 is only considered for divide
+assign ex1_op0_is_snan      = ex1_op0_snan;
+assign ex1_op1_is_snan      = ex1_op1_snan && ex1_div;
+assign ex1_op0_is_qnan      = ex1_op0_qnan;
+assign ex1_op1_is_qnan      = ex1_op1_qnan && ex1_div;
+assign ex1_op0_f[51:0]      = (ex1_op0_cnan) ? 52'b0: ex1_oper0[51:0]; // a cNaN contributes an all-zero payload
+assign ex1_op1_f[51:0]      = (ex1_op1_cnan) ? 52'b0: ex1_oper1[51:0]; // a cNaN contributes an all-zero payload
+// &CombBeg; @359
+always @( ex1_op0_is_snan // combinational: pick the fraction field of a qNaN result
+       or ex1_op0_is_qnan
+       or ex1_op0_f[51:0]
+       or ex1_rst_default_qnan
+       or ex1_op1_f[51:0]
+       or vfpu_yy_xx_dqnan
+       or ex1_op1_is_snan
+       or ex1_op1_is_qnan)
+begin
+if(ex1_rst_default_qnan) // highest priority: default pattern, only the MSB set
+  ex1_qnan_f[51:0] = {1'b1, 51'b0};
+else if(ex1_op0_is_snan && vfpu_yy_xx_dqnan) // operand bits are forwarded only while vfpu_yy_xx_dqnan is 1
+  ex1_qnan_f[51:0] = ex1_op0_f[51:0]; // forwarded unchanged; no quieting is done in this block
+else if(ex1_op1_is_snan && vfpu_yy_xx_dqnan) // sNaN operands take priority over qNaN operands
+  ex1_qnan_f[51:0] = ex1_op1_f[51:0];
+else if(ex1_op0_is_qnan && vfpu_yy_xx_dqnan) // operand 0 takes priority over operand 1
+  ex1_qnan_f[51:0] = ex1_op0_f[51:0];
+else if(ex1_op1_is_qnan && vfpu_yy_xx_dqnan)
+  ex1_qnan_f[51:0] = ex1_op1_f[51:0];
+else // vfpu_yy_xx_dqnan is 0 or no NaN operand: default pattern
+  ex1_qnan_f[51:0] = {1'b1, 51'b0};
+// &CombEnd; @372
+end
+
+// &CombBeg; @374
+always @( ex1_op0_is_snan // combinational: pick the sign of a qNaN result, same priority as ex1_qnan_f
+       or ex1_op0_cnan
+       or ex1_op0_is_qnan
+       or ex1_op1_sign
+       or ex1_op0_sign
+       or ex1_rst_default_qnan
+       or vfpu_yy_xx_dqnan
+       or ex1_op1_cnan
+       or ex1_op1_is_snan
+       or ex1_op1_is_qnan)
+begin
+if(ex1_rst_default_qnan) // default qNaN is positive
+  ex1_qnan_sign = 1'b0;
+else if(ex1_op0_is_snan && vfpu_yy_xx_dqnan) // operand sign is forwarded only while vfpu_yy_xx_dqnan is 1
+  ex1_qnan_sign = ex1_op0_sign;
+else if(ex1_op1_is_snan && vfpu_yy_xx_dqnan)
+  ex1_qnan_sign = ex1_op1_sign;
+else if(ex1_op0_is_qnan && vfpu_yy_xx_dqnan)
+  ex1_qnan_sign = ex1_op0_sign && !ex1_op0_cnan; // a cNaN forces sign 0
+else if(ex1_op1_is_qnan && vfpu_yy_xx_dqnan)
+  ex1_qnan_sign = ex1_op1_sign && !ex1_op1_cnan; // a cNaN forces sign 0
+else // vfpu_yy_xx_dqnan is 0 or no NaN operand: positive
+  ex1_qnan_sign = 1'b0;
+// &CombEnd; @387
+end
+
+
+//========================Pipe to EX2=======================
+//exponent register cal result
+//assign ex1_srt_expnt_rst[12:0] = (ex1_sqrt)
+//                               ? ex1_sqrt_expnt_result[12:0]
+//                               : ex1_expnt_result[12:0];
+//Special results (zero, qNaN, inf) should skip SRT logic
+assign ex1_srt_skip = ex1_result_zero || 
+                      ex1_result_qnan || 
+                      ex1_result_inf;
+//gate clk: ex1_pipe_clk clocks the EX1->EX2 registers; its local enable is ex1_pipe_clk_en
+// &Instance("gated_clk_cell","x_ex1_pipe_clk"); @400
+gated_clk_cell  x_ex1_pipe_clk (
+  .clk_in             (forever_cpuclk    ),
+  .clk_out            (ex1_pipe_clk      ),
+  .external_en        (1'b0              ),
+  .global_en          (cp0_yy_clk_en     ),
+  .local_en           (ex1_pipe_clk_en   ),
+  .module_en          (cp0_vfpu_icg_en   ),
+  .pad_yy_icg_scan_en (pad_yy_icg_scan_en)
+);
+
+// &Connect( .clk_in         (forever_cpuclk), @401
+//           .clk_out        (ex1_pipe_clk),//Out Clock @402
+//           .external_en    (1'b0), @403
+//           .global_en      (cp0_yy_clk_en), @404
+//           .local_en       (ex1_pipe_clk_en),//Local Condition @405
+//           .module_en      (cp0_vfpu_icg_en) @406
+//         ); @407
+assign ex1_pipe_clk_en = ex1_pipedown;
+
+always @(posedge ex1_pipe_clk or negedge cpurst_b) // EX1->EX2 pipeline registers on the gated clock
+begin
+  if(!cpurst_b) // asynchronous active-low reset: clear every EX2 register
+  begin
+    vfdsu_ex2_result_zero     <=  1'b0; 
+    vfdsu_ex2_result_qnan     <=  1'b0; 
+    vfdsu_ex2_result_inf      <=  1'b0; 
+    vfdsu_ex2_result_sign     <=  1'b0; 
+    vfdsu_ex2_op0_norm        <=  1'b0; 
+    vfdsu_ex2_op1_norm        <=  1'b0; 
+    vfdsu_ex2_expnt_add0[12:0] <= 13'b0; 
+    vfdsu_ex2_expnt_add1[12:0] <= 13'b0; 
+    vfdsu_ex2_nv              <=  1'b0; 
+    vfdsu_ex2_dz              <=  1'b0; 
+    vfdsu_ex2_srt_skip        <=  1'b0; 
+    vfdsu_ex2_of_rm_lfn       <=  1'b0;
+    vfdsu_ex2_qnan_sign       <=  1'b0;
+    vfdsu_ex2_qnan_f[51:0]    <= 52'b0;
+    vfdsu_ex2_rm[2:0]         <=  3'b0;
+    vfdsu_ex2_div             <=  1'b0;
+    vfdsu_ex2_sqrt            <=  1'b0;
+    vfdsu_ex2_double          <=  1'b0;
+    vfdsu_ex2_single          <=  1'b0;
+  end
+  else if(ex1_pipedown) // capture the EX1 results when the stage advances
+  begin
+    vfdsu_ex2_result_zero     <= ex1_result_zero; 
+    vfdsu_ex2_result_qnan     <= ex1_result_qnan; 
+    vfdsu_ex2_result_inf      <= ex1_result_inf; 
+    vfdsu_ex2_result_sign     <= ex1_result_sign; 
+    vfdsu_ex2_op0_norm        <= ex1_op0_norm; 
+    vfdsu_ex2_op1_norm        <= ex1_op1_norm; 
+    vfdsu_ex2_expnt_add0[12:0] <= ex1_expnt_adder_op0[12:0];
+    vfdsu_ex2_expnt_add1[12:0] <= ex1_expnt_adder_op1[12:0];
+    vfdsu_ex2_nv              <= ex1_nv; 
+    vfdsu_ex2_dz              <= ex1_dz; 
+    vfdsu_ex2_srt_skip        <= ex1_srt_skip; 
+    vfdsu_ex2_of_rm_lfn       <= ex1_of_result_lfn;
+    vfdsu_ex2_qnan_sign       <= ex1_qnan_sign;
+    vfdsu_ex2_qnan_f[51:0]    <= ex1_qnan_f[51:0];
+    vfdsu_ex2_rm[2:0]         <= ex1_rm[2:0];
+    vfdsu_ex2_div             <= ex1_div;
+    vfdsu_ex2_sqrt            <= ex1_sqrt;
+    vfdsu_ex2_double          <= ex1_double;
+    vfdsu_ex2_single          <= ex1_single;
+  end
+  else // no pipedown: every register explicitly holds its value
+  begin
+    vfdsu_ex2_result_zero     <= vfdsu_ex2_result_zero; 
+    vfdsu_ex2_result_qnan     <= vfdsu_ex2_result_qnan; 
+    vfdsu_ex2_result_inf      <= vfdsu_ex2_result_inf; 
+    vfdsu_ex2_result_sign     <= vfdsu_ex2_result_sign; 
+    vfdsu_ex2_op0_norm        <= vfdsu_ex2_op0_norm; 
+    vfdsu_ex2_op1_norm        <= vfdsu_ex2_op1_norm; 
+    vfdsu_ex2_expnt_add0[12:0] <= vfdsu_ex2_expnt_add0[12:0]; 
+    vfdsu_ex2_expnt_add1[12:0] <= vfdsu_ex2_expnt_add1[12:0]; 
+    vfdsu_ex2_nv              <= vfdsu_ex2_nv; 
+    vfdsu_ex2_dz              <= vfdsu_ex2_dz; 
+    vfdsu_ex2_srt_skip        <= vfdsu_ex2_srt_skip; 
+    vfdsu_ex2_of_rm_lfn       <= vfdsu_ex2_of_rm_lfn;
+    vfdsu_ex2_qnan_sign       <= vfdsu_ex2_qnan_sign;
+    vfdsu_ex2_qnan_f[51:0]    <= vfdsu_ex2_qnan_f[51:0];
+    vfdsu_ex2_rm[2:0]         <= vfdsu_ex2_rm[2:0];
+    vfdsu_ex2_div             <= vfdsu_ex2_div;
+    vfdsu_ex2_sqrt            <= vfdsu_ex2_sqrt;
+    vfdsu_ex2_double          <= vfdsu_ex2_double;
+    vfdsu_ex2_single          <= vfdsu_ex2_single;
+  end
+end
+
+// &Force("output","vfdsu_ex2_op0_norm"); @480
+// &Force("output","vfdsu_ex2_op1_norm"); @481
+// &Force("output","vfdsu_ex2_dz"); @482
+// &Force("output","vfdsu_ex2_nv"); @483
+// &Force("output","vfdsu_ex2_srt_skip"); @484
+// &Force("output","vfdsu_ex2_of_rm_lfn"); @485
+// &Force("output","vfdsu_ex2_result_inf"); @486
+// &Force("output","vfdsu_ex2_result_qnan"); @487
+// &Force("output","vfdsu_ex2_result_zero"); @488
+// //&Force("output","vfdsu_ex2_expnt_rst"); @489
+// &Force("output","vfdsu_ex2_result_sign"); @490
+// &Force("output","vfdsu_ex2_qnan_f"); @491
+// &Force("output","vfdsu_ex2_qnan_sign"); @492
+// &Force("output","vfdsu_ex2_rm"); @493
+// &Force("output","vfdsu_ex2_div"); @494
+// &Force("output","vfdsu_ex2_sqrt"); @495
+// &Force("output","vfdsu_ex2_double"); @496
+// &Force("output","vfdsu_ex2_single"); @497
+// &Force("output","vfdsu_ex2_expnt_add0"); @498
+// &Force("output","vfdsu_ex2_expnt_add1"); @499
+
+// &ModuleEnd; @501
+endmodule
+
+
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v
new file mode 100644
index 00000000..6eece526
--- /dev/null
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v
@@ -0,0 +1,1041 @@
+/*Copyright 2019-2021 T-Head Semiconductor Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// &ModuleBeg; @22
+module ct_vfdsu_round(
+  cp0_vfpu_icg_en,
+  cp0_yy_clk_en,
+  cpurst_b,
+  ex3_pipedown,
+  forever_cpuclk,
+  pad_yy_icg_scan_en,
+  total_qt_rt_58,
+  vfdsu_ex2_of_rm_lfn,
+  vfdsu_ex3_doub_expnt_rst,
+  vfdsu_ex3_double,
+  vfdsu_ex3_dz,
+  vfdsu_ex3_half_expnt_rst,
+  vfdsu_ex3_id_srt_skip,
+  vfdsu_ex3_nv,
+  vfdsu_ex3_of,
+  vfdsu_ex3_potnt_of,
+  vfdsu_ex3_potnt_uf,
+  vfdsu_ex3_qnan_f,
+  vfdsu_ex3_qnan_sign,
+  vfdsu_ex3_rem_sign,
+  vfdsu_ex3_rem_zero,
+  vfdsu_ex3_result_denorm_round_add_num,
+  vfdsu_ex3_result_inf,
+  vfdsu_ex3_result_lfn,
+  vfdsu_ex3_result_qnan,
+  vfdsu_ex3_result_sign,
+  vfdsu_ex3_result_zero,
+  vfdsu_ex3_rm,
+  vfdsu_ex3_rslt_denorm,
+  vfdsu_ex3_sing_expnt_rst,
+  vfdsu_ex3_single,
+  vfdsu_ex3_uf,
+  vfdsu_ex4_denorm_to_tiny_frac,
+  vfdsu_ex4_double,
+  vfdsu_ex4_dz,
+  vfdsu_ex4_expnt_rst,
+  vfdsu_ex4_frac,
+  vfdsu_ex4_nv,
+  vfdsu_ex4_nx,
+  vfdsu_ex4_of,
+  vfdsu_ex4_of_rst_lfn,
+  vfdsu_ex4_potnt_norm,
+  vfdsu_ex4_potnt_of,
+  vfdsu_ex4_potnt_uf,
+  vfdsu_ex4_qnan_f,
+  vfdsu_ex4_qnan_sign,
+  vfdsu_ex4_result_inf,
+  vfdsu_ex4_result_lfn,
+  vfdsu_ex4_result_nor,
+  vfdsu_ex4_result_qnan,
+  vfdsu_ex4_result_sign,
+  vfdsu_ex4_result_zero,
+  vfdsu_ex4_rslt_denorm,
+  vfdsu_ex4_single,
+  vfdsu_ex4_uf
+);
+
+// &Ports; @23
+input           cp0_vfpu_icg_en;                      
+input           cp0_yy_clk_en;                        
+input           cpurst_b;                             
+input           ex3_pipedown;                         
+input           forever_cpuclk;                       
+input           pad_yy_icg_scan_en;                   
+input   [57:0]  total_qt_rt_58;                       
+input           vfdsu_ex2_of_rm_lfn;                  
+input   [12:0]  vfdsu_ex3_doub_expnt_rst;             
+input           vfdsu_ex3_double;                     
+input           vfdsu_ex3_dz;                         
+input   [12:0]  vfdsu_ex3_half_expnt_rst;             
+input           vfdsu_ex3_id_srt_skip;                
+input           vfdsu_ex3_nv;                         
+input           vfdsu_ex3_of;                         
+input           vfdsu_ex3_potnt_of;                   
+input           vfdsu_ex3_potnt_uf;                   
+input   [51:0]  vfdsu_ex3_qnan_f;                     
+input           vfdsu_ex3_qnan_sign;                  
+input           vfdsu_ex3_rem_sign;                   
+input           vfdsu_ex3_rem_zero;                   
+input   [52:0]  vfdsu_ex3_result_denorm_round_add_num; 
+input           vfdsu_ex3_result_inf;                 
+input           vfdsu_ex3_result_lfn;                 
+input           vfdsu_ex3_result_qnan;                
+input           vfdsu_ex3_result_sign;                
+input           vfdsu_ex3_result_zero;                
+input   [2 :0]  vfdsu_ex3_rm;                         
+input           vfdsu_ex3_rslt_denorm;                
+input   [8 :0]  vfdsu_ex3_sing_expnt_rst;             
+input           vfdsu_ex3_single;                     
+input           vfdsu_ex3_uf;                         
+output          vfdsu_ex4_denorm_to_tiny_frac;        
+output          vfdsu_ex4_double;                     
+output          vfdsu_ex4_dz;                         
+output  [12:0]  vfdsu_ex4_expnt_rst;                  
+output  [54:0]  vfdsu_ex4_frac;                       
+output          vfdsu_ex4_nv;                         
+output          vfdsu_ex4_nx;                         
+output          vfdsu_ex4_of;                         
+output          vfdsu_ex4_of_rst_lfn;                 
+output  [1 :0]  vfdsu_ex4_potnt_norm;                 
+output          vfdsu_ex4_potnt_of;                   
+output          vfdsu_ex4_potnt_uf;                   
+output  [51:0]  vfdsu_ex4_qnan_f;                     
+output          vfdsu_ex4_qnan_sign;                  
+output          vfdsu_ex4_result_inf;                 
+output          vfdsu_ex4_result_lfn;                 
+output          vfdsu_ex4_result_nor;                 
+output          vfdsu_ex4_result_qnan;                
+output          vfdsu_ex4_result_sign;                
+output          vfdsu_ex4_result_zero;                
+output          vfdsu_ex4_rslt_denorm;                
+output          vfdsu_ex4_single;                     
+output          vfdsu_ex4_uf;                         
+
+// &Regs; @24
+reg             denorm_to_tiny_frac;                  
+reg             double_denorm_lst_frac;               
+reg     [54:0]  frac_add1_op1;                        
+reg             frac_add_1;                           
+reg             frac_orig;                            
+reg     [54:0]  frac_sub1_op1;                        
+reg             frac_sub_1;                           
+reg             half_denorm_lst_frac;                 
+reg     [56:0]  qt_result_double_denorm_for_round;    
+reg     [13:0]  qt_result_half_denorm_for_round;      
+reg     [27:0]  qt_result_single_denorm_for_round;    
+reg             single_denorm_lst_frac;               
+reg             vfdsu_ex4_denorm_to_tiny_frac;        
+reg             vfdsu_ex4_double;                     
+reg             vfdsu_ex4_dz;                         
+reg     [12:0]  vfdsu_ex4_expnt_rst;                  
+reg     [54:0]  vfdsu_ex4_frac;                       
+reg             vfdsu_ex4_nv;                         
+reg             vfdsu_ex4_nx;                         
+reg             vfdsu_ex4_of;                         
+reg             vfdsu_ex4_of_rst_lfn;                 
+reg     [1 :0]  vfdsu_ex4_potnt_norm;                 
+reg             vfdsu_ex4_potnt_of;                   
+reg             vfdsu_ex4_potnt_uf;                   
+reg     [51:0]  vfdsu_ex4_qnan_f;                     
+reg             vfdsu_ex4_qnan_sign;                  
+reg             vfdsu_ex4_result_inf;                 
+reg             vfdsu_ex4_result_lfn;                 
+reg             vfdsu_ex4_result_nor;                 
+reg             vfdsu_ex4_result_qnan;                
+reg             vfdsu_ex4_result_sign;                
+reg             vfdsu_ex4_result_zero;                
+reg             vfdsu_ex4_rslt_denorm;                
+reg             vfdsu_ex4_single;                     
+reg             vfdsu_ex4_uf;                         
+
+// &Wires; @25
+wire            cp0_vfpu_icg_en;                      
+wire            cp0_yy_clk_en;                        
+wire            cpurst_b;                             
+wire            ex3_denorm_eq;                        
+wire            ex3_denorm_gr;                        
+wire            ex3_denorm_lst_frac;                  
+wire            ex3_denorm_nx;                        
+wire            ex3_denorm_plus;                      
+wire            ex3_denorm_potnt_norm;                
+wire            ex3_denorm_zero;                      
+wire            ex3_doub_denorm_plus;                 
+wire            ex3_doub_denorm_potnt_norm;           
+wire            ex3_doub_eq;                          
+wire            ex3_doub_gr;                          
+wire            ex3_doub_rst_eq_1;                    
+wire            ex3_doub_zero;                        
+wire            ex3_double_denorm_eq;                 
+wire            ex3_double_denorm_gr;                 
+wire            ex3_double_denorm_zero;               
+wire            ex3_double_low_not_zero;              
+wire    [12:0]  ex3_expnt_adjst;                      
+wire    [12:0]  ex3_expnt_adjust_result;              
+wire            ex3_half_denorm_eq;                   
+wire            ex3_half_denorm_gr;                   
+wire            ex3_half_denorm_plus;                 
+wire            ex3_half_denorm_potnt_norm;           
+wire            ex3_half_denorm_zero;                 
+wire            ex3_half_eq;                          
+wire            ex3_half_gr;                          
+wire            ex3_half_low_not_zero;                
+wire            ex3_half_rst_eq_1;                    
+wire            ex3_half_zero;                        
+wire            ex3_nx;                               
+wire            ex3_pipe_clk;                         
+wire            ex3_pipe_clk_en;                      
+wire            ex3_pipedown;                         
+wire    [1 :0]  ex3_potnt_norm;                       
+wire            ex3_qt_doub_lo2_not0;                 
+wire            ex3_qt_doub_lo3_not0;                 
+wire            ex3_qt_eq;                            
+wire            ex3_qt_gr;                            
+wire            ex3_qt_half_lo2_not0;                 
+wire            ex3_qt_half_lo3_not0;                 
+wire            ex3_qt_sing_lo3_not0;                 
+wire            ex3_qt_sing_lo4_not0;                 
+wire            ex3_qt_zero;                          
+wire            ex3_rslt_denorm;                      
+wire            ex3_rst_eq_1;                         
+wire            ex3_rst_nor;                          
+wire            ex3_sing_denorm_plus;                 
+wire            ex3_sing_denorm_potnt_norm;           
+wire            ex3_sing_eq;                          
+wire            ex3_sing_gr;                          
+wire            ex3_sing_rst_eq_1;                    
+wire            ex3_sing_zero;                        
+wire            ex3_single_denorm_eq;                 
+wire            ex3_single_denorm_gr;                 
+wire            ex3_single_denorm_zero;               
+wire            ex3_single_low_not_zero;              
+wire            forever_cpuclk;                       
+wire    [54:0]  frac_add1_op1_with_denorm;            
+wire    [54:0]  frac_add1_rst;                        
+wire            frac_denorm_rdn_add_1;                
+wire            frac_denorm_rdn_sub_1;                
+wire            frac_denorm_rmm_add_1;                
+wire            frac_denorm_rne_add_1;                
+wire            frac_denorm_rtz_sub_1;                
+wire            frac_denorm_rup_add_1;                
+wire            frac_denorm_rup_sub_1;                
+wire    [54:0]  frac_final_rst;                       
+wire            frac_rdn_add_1;                       
+wire            frac_rdn_sub_1;                       
+wire            frac_rmm_add_1;                       
+wire            frac_rne_add_1;                       
+wire            frac_rtz_sub_1;                       
+wire            frac_rup_add_1;                       
+wire            frac_rup_sub_1;                       
+wire    [54:0]  frac_sub1_op1_with_denorm;            
+wire    [54:0]  frac_sub1_rst;                        
+wire            pad_yy_icg_scan_en;                   
+wire    [57:0]  total_qt_rt_58;                       
+wire            vfdsu_ex2_of_rm_lfn;                  
+wire    [12:0]  vfdsu_ex3_doub_expnt_rst;             
+wire            vfdsu_ex3_double;                     
+wire            vfdsu_ex3_dz;                         
+wire    [12:0]  vfdsu_ex3_expnt_rst;                  
+wire    [12:0]  vfdsu_ex3_half_expnt_rst;             
+wire            vfdsu_ex3_id_srt_skip;                
+wire            vfdsu_ex3_nv;                         
+wire            vfdsu_ex3_of;                         
+wire            vfdsu_ex3_potnt_of;                   
+wire            vfdsu_ex3_potnt_uf;                   
+wire    [51:0]  vfdsu_ex3_qnan_f;                     
+wire            vfdsu_ex3_qnan_sign;                  
+wire            vfdsu_ex3_rem_sign;                   
+wire            vfdsu_ex3_rem_zero;                   
+wire    [52:0]  vfdsu_ex3_result_denorm_round_add_num; 
+wire            vfdsu_ex3_result_inf;                 
+wire            vfdsu_ex3_result_lfn;                 
+wire            vfdsu_ex3_result_qnan;                
+wire            vfdsu_ex3_result_sign;                
+wire            vfdsu_ex3_result_zero;                
+wire    [2 :0]  vfdsu_ex3_rm;                         
+wire            vfdsu_ex3_rslt_denorm;                
+wire    [8 :0]  vfdsu_ex3_sing_expnt_rst;             
+wire            vfdsu_ex3_single;                     
+wire            vfdsu_ex3_uf;                         
+
+
+//=======================Round Rule=========================
+//1/8 <= x < 1/4, 1/2 <= y < 1, => 1/8 < z < 1/2
+//q[57:0] represent the fraction part result of quotient, q[57] for 1/2
+//Thus the first "1" in 58 bit quotient will be in q[56] or q[55]
+//For Double Float
+//29 round to get 58 bit quotient, 52+1 bit as valid result, other for round
+//if q[56] is 1, q[56:4] as 1.xxxx valid result, [3:0] for round
+//if q[56] is 0, q[55:3] as 1.xxxx valid result, [2:0] for round
+//For Single Float
+//15 round to get 30 bit quotient, 23+1 bit as valid result, other for round
+//if q[56] is 1, q[56:33] as 1.xxxx valid result, [32:28] for round
+//if q[56] is 0, q[55:32] as 1.xxxx valid result, [31:28] for round
+assign ex3_qt_half_lo3_not0 = |total_qt_rt_58[44:42]; // half: bits below round MSB q[45] (used when q[56]=1)
+assign ex3_qt_half_lo2_not0 = |total_qt_rt_58[43:42]; // half: bits below round MSB q[44] (used when q[56]=0)
+assign ex3_half_gr          = (total_qt_rt_58[56])    // half round bits are greater than 10..0
+                            ?  total_qt_rt_58[45] && ex3_qt_half_lo3_not0
+                            :  total_qt_rt_58[44] && ex3_qt_half_lo2_not0;
+assign ex3_half_eq          = (total_qt_rt_58[56])    // half round bits equal 10..0: must test the half sticky bits, not the single ones
+                            ?  total_qt_rt_58[45] && !ex3_qt_half_lo3_not0
+                            :  total_qt_rt_58[44] && !ex3_qt_half_lo2_not0;
+assign ex3_half_zero        = (total_qt_rt_58[56])    // half round bits are all zero
+                            ? ~|total_qt_rt_58[45:42]
+                            : ~|total_qt_rt_58[44:42];
+assign ex3_half_rst_eq_1    = total_qt_rt_58[56] && ~|total_qt_rt_58[55:46]; // q[56] set and q[55:46] all zero
+assign ex3_half_denorm_plus = !total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1ff2); // 13'h1ff2 = -14, q[56] clear
+assign ex3_half_denorm_potnt_norm = total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1ff1); // 13'h1ff1 = -15, q[56] set
+assign vfdsu_ex3_expnt_rst[12:0]  = vfdsu_ex3_half_expnt_rst[12:0]; // 13-bit exponent used by all *_denorm_plus / *_potnt_norm compares
+// &Force("bus","total_qt_rt_58",57,0); @54
+assign ex3_qt_doub_lo3_not0 = |total_qt_rt_58[2:0]; // double: bits below round MSB q[3] (used when q[56]=1)
+assign ex3_qt_doub_lo2_not0 = |total_qt_rt_58[1:0]; // double: bits below round MSB q[2] (used when q[56]=0)
+assign ex3_qt_sing_lo4_not0 = |total_qt_rt_58[31:28]; // single: bits below round MSB q[32] (used when q[56]=1)
+assign ex3_qt_sing_lo3_not0 = |total_qt_rt_58[30:28]; // single: bits below round MSB q[31] (used when q[56]=0)
+//the quotient round bits are greater than "10000" (round bits 10..0)
+assign ex3_doub_gr          = (total_qt_rt_58[56])
+                            ?  total_qt_rt_58[3] && ex3_qt_doub_lo3_not0
+                            :  total_qt_rt_58[2] && ex3_qt_doub_lo2_not0;
+assign ex3_sing_gr          = (total_qt_rt_58[56])
+                            ?  total_qt_rt_58[32] && ex3_qt_sing_lo4_not0
+                            :  total_qt_rt_58[31] && ex3_qt_sing_lo3_not0;
+
+//the quotient round bits are equal to "10000" (round bits 10..0)
+assign ex3_doub_eq          = (total_qt_rt_58[56])
+                            ?  total_qt_rt_58[3] && !ex3_qt_doub_lo3_not0 
+                            :  total_qt_rt_58[2] && !ex3_qt_doub_lo2_not0;
+assign ex3_sing_eq          = (total_qt_rt_58[56])
+                            ?  total_qt_rt_58[32] && !ex3_qt_sing_lo4_not0 
+                            :  total_qt_rt_58[31] && !ex3_qt_sing_lo3_not0;
+//the quotient round bits are all zero
+assign ex3_doub_zero        = (total_qt_rt_58[56])
+                            ? ~|total_qt_rt_58[3:0]
+                            : ~|total_qt_rt_58[2:0];
+assign ex3_sing_zero        = (total_qt_rt_58[56])
+                            ? ~|total_qt_rt_58[32:28]
+                            : ~|total_qt_rt_58[31:28];
+//quotient is 1.00000..00 (q[56] set, all lower result bits zero); handled specially below
+assign ex3_doub_rst_eq_1    = total_qt_rt_58[56] && ~|total_qt_rt_58[55:4];
+assign ex3_sing_rst_eq_1    = total_qt_rt_58[56] && ~|total_qt_rt_58[55:33];
+// for a denormal result, first select the quotient bits used for rounding;
+//  specifically for results with e=-126 (single) and e=-1022 (double), whether
+//  the result is denormal depends on the MSB of the quotient, q[56]
+assign ex3_doub_denorm_plus       = !total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1c02); // 13'h1c02 = -1022, q[56] clear
+assign ex3_sing_denorm_plus       = !total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1f82); // 13'h1f82 = -126, q[56] clear
+
+assign ex3_doub_denorm_potnt_norm = total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1c01); // 13'h1c01 = -1023, q[56] set
+assign ex3_sing_denorm_potnt_norm = total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1f81); // 13'h1f81 = -127, q[56] set
+assign ex3_rslt_denorm            = ex3_denorm_plus || vfdsu_ex3_rslt_denorm; // selects the denormal rounding path below
+assign ex3_denorm_potnt_norm      = vfdsu_ex3_double ? ex3_doub_denorm_potnt_norm :
+                                                       vfdsu_ex3_single ? ex3_sing_denorm_potnt_norm
+                                                                        : ex3_half_denorm_potnt_norm;
+assign ex3_rst_eq_1         = (vfdsu_ex3_double)? ex3_doub_rst_eq_1 : // format mux: double, then single, else half
+                               vfdsu_ex3_single ? ex3_sing_rst_eq_1 : ex3_half_rst_eq_1;
+assign ex3_qt_eq            = (vfdsu_ex3_double)? ex3_doub_eq :       // round bits == 10..0 for the active format
+                               vfdsu_ex3_single ? ex3_sing_eq : ex3_half_eq;
+assign ex3_qt_gr            = (vfdsu_ex3_double)? ex3_doub_gr :       // round bits >  10..0 for the active format
+                               vfdsu_ex3_single ? ex3_sing_gr : ex3_half_gr;
+assign ex3_qt_zero          = (vfdsu_ex3_double)? ex3_doub_zero :     // round bits == 0 for the active format
+                               vfdsu_ex3_single ? ex3_sing_zero : ex3_half_zero;
+assign ex3_denorm_plus            = (vfdsu_ex3_double)  ? ex3_doub_denorm_plus 
+                                    : vfdsu_ex3_single ? ex3_sing_denorm_plus
+                                                       : ex3_half_denorm_plus;
+                             
+// &CombBeg; @108
+always @( vfdsu_ex3_doub_expnt_rst[12:0]
+       or total_qt_rt_58[56:0])
+begin
+case(vfdsu_ex3_doub_expnt_rst[12:0]) // per exponent: left-justify the low quotient bits as the round field; lst_frac is the next bit above them
+  13'h1c02:begin qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[3:0], 53'b0}; 
+                 double_denorm_lst_frac =  total_qt_rt_58[4];
+						end//-1022 1
+  13'h1c01:begin qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[4:0], 52'b0}; //-1023 0
+                 double_denorm_lst_frac =  total_qt_rt_58[5];
+						end//-1022 1
+  13'h1c00:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[5:0], 51'b0}; //-1024 -1
+                 double_denorm_lst_frac =  total_qt_rt_58[6];
+						end//-1022 1
+  13'h1bff:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[6:0], 50'b0}; //-1025 -2
+                 double_denorm_lst_frac =  total_qt_rt_58[7];
+						end//-1022 1
+  13'h1bfe:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[7:0], 49'b0}; //-1026 -3
+                 double_denorm_lst_frac =  total_qt_rt_58[8];
+						end//-1022 1
+  13'h1bfd:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[8:0], 48'b0}; //-1027 -4
+                 double_denorm_lst_frac =  total_qt_rt_58[9];
+						end//-1022 1
+  13'h1bfc:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[9:0], 47'b0}; //-1028 -5
+                 double_denorm_lst_frac =  total_qt_rt_58[10];
+						end//-1022 1
+  13'h1bfb:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[10:0],46'b0}; //-1029 -6
+                 double_denorm_lst_frac =  total_qt_rt_58[11];
+						end//-1022 1
+  13'h1bfa:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[11:0],45'b0}; //-1030 -7
+                 double_denorm_lst_frac =  total_qt_rt_58[12];
+						end//-1022 1
+  13'h1bf9:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[12:0],44'b0}; //-1031 -8
+                 double_denorm_lst_frac =  total_qt_rt_58[13];
+						end//-1022 1
+  13'h1bf8:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[13:0],43'b0}; //-1032 -9
+                 double_denorm_lst_frac =  total_qt_rt_58[14];
+						end//-1022 1
+  13'h1bf7:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[14:0],42'b0}; //-1033 -10
+                 double_denorm_lst_frac =  total_qt_rt_58[15];
+						end//-1022 1
+  13'h1bf6:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[15:0],41'b0}; //-1034 -11
+                 double_denorm_lst_frac =  total_qt_rt_58[16];
+						end//-1022 1
+  13'h1bf5:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[16:0],40'b0}; //-1035 -12
+                 double_denorm_lst_frac =  total_qt_rt_58[17];
+						end//-1022 1
+  13'h1bf4:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[17:0],39'b0}; //-1036 -13
+                 double_denorm_lst_frac =  total_qt_rt_58[18];
+						end//-1022 1
+  13'h1bf3:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[18:0],38'b0}; // -1037
+                 double_denorm_lst_frac =  total_qt_rt_58[19];
+						end//-1022 1
+  13'h1bf2:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[19:0],37'b0}; //-1038
+                 double_denorm_lst_frac =  total_qt_rt_58[20];
+						end//-1022 1
+  13'h1bf1:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[20:0],36'b0}; //-1039
+                 double_denorm_lst_frac =  total_qt_rt_58[21];
+						end//-1022 1
+  13'h1bf0:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[21:0],35'b0}; //-1040
+                 double_denorm_lst_frac =  total_qt_rt_58[22];
+						end//-1022 1
+  13'h1bef:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[22:0],34'b0}; //-1041
+                 double_denorm_lst_frac =  total_qt_rt_58[23];
+						end//-1022 1
+  13'h1bee:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[23:0],33'b0}; //-1042
+                 double_denorm_lst_frac =  total_qt_rt_58[24];
+						end//-1022 1
+  13'h1bed:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[24:0],32'b0}; //-1043
+                 double_denorm_lst_frac =  total_qt_rt_58[25];
+						end//-1022 1
+  13'h1bec:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[25:0],31'b0}; //-1044
+                 double_denorm_lst_frac =  total_qt_rt_58[26];
+						end//-1022 1
+  13'h1beb:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[26:0],30'b0}; //-1045
+                 double_denorm_lst_frac =  total_qt_rt_58[27];
+						end//-1022 1
+  13'h1bea:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[27:0],29'b0}; //-1046
+                 double_denorm_lst_frac =  total_qt_rt_58[28];
+						end//-1022 1
+  13'h1be9:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[28:0],28'b0}; //-1047
+                 double_denorm_lst_frac =  total_qt_rt_58[29];
+						end//-1022 1
+  13'h1be8:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[29:0],27'b0}; //-1048
+                 double_denorm_lst_frac =  total_qt_rt_58[30];
+						end//-1022 1
+  13'h1be7:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[30:0],26'b0}; //-1049
+                 double_denorm_lst_frac =  total_qt_rt_58[31];
+						end//-1022 1
+  13'h1be6:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[31:0],25'b0}; //-1050
+                 double_denorm_lst_frac =  total_qt_rt_58[32];
+						end//-1022 1
+  13'h1be5:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[32:0],24'b0}; //-1051
+                 double_denorm_lst_frac =  total_qt_rt_58[33];
+						end//-1022 1
+  13'h1be4:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[33:0],23'b0}; //-1052
+                 double_denorm_lst_frac =  total_qt_rt_58[34];
+						end//-1022 1
+  13'h1be3:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[34:0],22'b0}; //-1053
+                 double_denorm_lst_frac =  total_qt_rt_58[35];
+						end//-1022 1
+  13'h1be2:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[35:0],21'b0}; //-1054
+                 double_denorm_lst_frac =  total_qt_rt_58[36];
+						end//-1022 1
+  13'h1be1:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[36:0],20'b0}; //-1055
+                 double_denorm_lst_frac =  total_qt_rt_58[37];
+						end//-1022 1
+  13'h1be0:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[37:0],19'b0}; //-1056
+                 double_denorm_lst_frac =  total_qt_rt_58[38];
+						end//-1022 1
+  13'h1bdf:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[38:0],18'b0}; //-1057
+                 double_denorm_lst_frac =  total_qt_rt_58[39];
+						end//-1022 1
+  13'h1bde:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[39:0],17'b0}; //-1058
+                 double_denorm_lst_frac =  total_qt_rt_58[40];
+						end//-1022 1
+  13'h1bdd:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[40:0],16'b0}; //-1059
+                 double_denorm_lst_frac =  total_qt_rt_58[41];
+						end//-1022 1
+  13'h1bdc:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[41:0],15'b0}; //-1060
+                 double_denorm_lst_frac =  total_qt_rt_58[42];
+						end//-1022 1
+  13'h1bdb:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[42:0],14'b0}; //-1061
+                 double_denorm_lst_frac =  total_qt_rt_58[43];
+						end//-1022 1
+  13'h1bda:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[43:0],13'b0}; //-1062
+                 double_denorm_lst_frac =  total_qt_rt_58[44];
+						end//-1022 1
+  13'h1bd9:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[44:0],12'b0}; //-1063
+                 double_denorm_lst_frac =  total_qt_rt_58[45];
+						end//-1022 1
+  13'h1bd8:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[45:0],11'b0}; //-1064
+                 double_denorm_lst_frac =  total_qt_rt_58[46];
+						end//-1022 1
+  13'h1bd7:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[46:0],10'b0}; //-1065
+                 double_denorm_lst_frac =  total_qt_rt_58[47];
+						end//-1022 1
+  13'h1bd6:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[47:0],9'b0};  //-1066
+                 double_denorm_lst_frac =  total_qt_rt_58[48];
+						end//-1022 1
+  13'h1bd5:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[48:0],8'b0};  //-1067
+                 double_denorm_lst_frac =  total_qt_rt_58[49];
+						end//-1022 1
+  13'h1bd4:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[49:0],7'b0};  //-1068
+                 double_denorm_lst_frac =  total_qt_rt_58[50];
+						end//-1022 1
+  13'h1bd3:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[50:0],6'b0};  //-1069
+                 double_denorm_lst_frac =  total_qt_rt_58[51];
+						end//-1022 1
+  13'h1bd2:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[51:0],5'b0};  //-1070
+                 double_denorm_lst_frac =  total_qt_rt_58[52];
+						end//-1022 1
+  13'h1bd1:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[52:0],4'b0};  //-1071
+                 double_denorm_lst_frac =  total_qt_rt_58[53];
+						end//-1022 1
+  13'h1bd0:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[53:0],3'b0};  //-1072
+                 double_denorm_lst_frac =  total_qt_rt_58[54];
+						end//-1022 1
+  13'h1bcf:begin  qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[54:0],2'b0};  //-1073
+                 double_denorm_lst_frac =  total_qt_rt_58[55];
+						end//-1022 1
+  13'h1bce:begin qt_result_double_denorm_for_round[56:0] = {total_qt_rt_58[55:0],1'b0}; //-1074
+                 double_denorm_lst_frac =  total_qt_rt_58[56];
+						end//-1022 1
+  default:begin qt_result_double_denorm_for_round[56:0] = total_qt_rt_58[56:0]; // any other exponent: whole q[56:0] is the round field
+                 double_denorm_lst_frac =  1'b0;
+						end//-1022 1
+
+endcase                                                                     
+// &CombEnd; @274
+end
+//denormal result: check for rounding; further optimization can be done in
+//the future
+assign ex3_double_low_not_zero   = |qt_result_double_denorm_for_round[55:0];   // any bit set below the round-field MSB
+assign ex3_double_denorm_zero    = !(qt_result_double_denorm_for_round[56]     // round field is entirely zero
+                                     || ex3_double_low_not_zero);
+assign ex3_double_denorm_eq      = !ex3_double_low_not_zero                    // round field is exactly 100..0
+                                   && qt_result_double_denorm_for_round[56];
+assign ex3_double_denorm_gr      = ex3_double_low_not_zero                     // round field is above 100..0
+                                   && qt_result_double_denorm_for_round[56];
+
+// &CombBeg; @285
+always @( vfdsu_ex3_sing_expnt_rst[8:0]
+       or total_qt_rt_58[56:28])
+begin
+case(vfdsu_ex3_sing_expnt_rst[8:0]) // per exponent: left-justify the low quotient bits (down to q[28]) as the round field; lst_frac is the next bit above them
+  9'h182:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[32:28],23'b0}; //-126 1
+                single_denorm_lst_frac =  total_qt_rt_58[33];
+			 		end//-1022 1
+  9'h181:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[33:28],22'b0}; //-127 0
+                single_denorm_lst_frac =  total_qt_rt_58[34];
+			 		end//-1022 1
+  9'h180:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[34:28],21'b0}; //-128 -1
+                single_denorm_lst_frac =  total_qt_rt_58[35];
+			 		end//-1022 1
+  9'h17f:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[35:28],20'b0}; //-129 -2
+                single_denorm_lst_frac =  total_qt_rt_58[36];
+			 		end//-1022 1
+  9'h17e:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[36:28],19'b0}; //-130 -3
+                single_denorm_lst_frac =  total_qt_rt_58[37];
+			 		end//-1022 1
+  9'h17d:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[37:28],18'b0}; //-131 -4
+                single_denorm_lst_frac =  total_qt_rt_58[38];
+			 		end//-1022 1
+  9'h17c:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[38:28],17'b0}; //-132 -5
+                single_denorm_lst_frac =  total_qt_rt_58[39];
+			 		end//-1022 1
+  9'h17b:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[39:28],16'b0}; //-133 -6
+                single_denorm_lst_frac =  total_qt_rt_58[40];
+			 		end//-1022 1
+  9'h17a:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[40:28],15'b0}; //-134 -7
+                single_denorm_lst_frac =  total_qt_rt_58[41];
+			 		end//-1022 1
+  9'h179:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[41:28],14'b0}; //-135 -8
+                single_denorm_lst_frac =  total_qt_rt_58[42];
+			 		end//-1022 1
+  9'h178:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[42:28],13'b0}; //-136 -9
+                single_denorm_lst_frac =  total_qt_rt_58[43];
+			 		end//-1022 1
+  9'h177:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[43:28],12'b0}; //-137 -10
+                single_denorm_lst_frac =  total_qt_rt_58[44];
+			 		end//-1022 1
+  9'h176:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[44:28],11'b0}; //-138 -11
+                single_denorm_lst_frac =  total_qt_rt_58[45];
+			 		end//-1022 1
+  9'h175:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[45:28],10'b0}; //-139 -12
+                single_denorm_lst_frac =  total_qt_rt_58[46];
+			 		end//-1022 1
+  9'h174:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[46:28],9'b0}; //-140 -13
+                single_denorm_lst_frac =  total_qt_rt_58[47];
+			 		end//-1022 1
+  9'h173:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[47:28],8'b0}; // -141
+                single_denorm_lst_frac =  total_qt_rt_58[48];
+			 		end//-1022 1
+  9'h172:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[48:28],7'b0};//-142
+                single_denorm_lst_frac =  total_qt_rt_58[49];
+			 		end//-1022 1
+  9'h171:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[49:28],6'b0};//-143
+                single_denorm_lst_frac =  total_qt_rt_58[50];
+			 		end//-1022 1
+  9'h170:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[50:28],5'b0}; //-144
+                single_denorm_lst_frac =  total_qt_rt_58[51];
+			 		end//-1022 1
+  9'h16f:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[51:28],4'b0}; //-145
+                single_denorm_lst_frac =  total_qt_rt_58[52];
+			 		end//-1022 1
+  9'h16e:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[52:28],3'b0}; //-146
+                single_denorm_lst_frac =  total_qt_rt_58[53];
+			 		end//-1022 1
+  9'h16d:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[53:28],2'b0}; //-147
+                single_denorm_lst_frac =  total_qt_rt_58[54];
+			 		end//-1022 1
+  9'h16c:begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[54:28],1'b0}; //-148
+                single_denorm_lst_frac =  total_qt_rt_58[55];
+			 		end//-1022 1
+  9'h16b: begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[55:28]}; //-149
+                 single_denorm_lst_frac = total_qt_rt_58[56] ;
+						end//-1022 1
+  default:  begin qt_result_single_denorm_for_round[27:0] = {total_qt_rt_58[56:29]}; // any other exponent: q[56:29] is the round field
+                 single_denorm_lst_frac = 1'b0;
+						end//-1022 1
+endcase
+// &CombEnd;  @363
+end
+//rounding evaluation for single denormalize number 
+assign ex3_single_low_not_zero   = |qt_result_single_denorm_for_round[26:0];   // any bit set below the round-field MSB
+assign ex3_single_denorm_zero    = !(qt_result_single_denorm_for_round[27]     // round field is entirely zero
+                                     || ex3_single_low_not_zero);
+assign ex3_single_denorm_eq      = !ex3_single_low_not_zero                    // round field is exactly 100..0
+                                   && qt_result_single_denorm_for_round[27];
+assign ex3_single_denorm_gr      = ex3_single_low_not_zero                     // round field is above 100..0
+                                   && qt_result_single_denorm_for_round[27];
+// &CombBeg; @372
+always @( total_qt_rt_58[56:42]
+       or vfdsu_ex3_half_expnt_rst[12:0])
+begin
+case(vfdsu_ex3_half_expnt_rst[12:0]) // per exponent: left-justify the low quotient bits (down to q[42]) as the round field; lst_frac is the next bit above them
+  13'h1ff2:begin qt_result_half_denorm_for_round[13:0] = {total_qt_rt_58[45:42],10'b0}; //-14 1
+                 half_denorm_lst_frac =  total_qt_rt_58[46];
+						end//-1022 1
+  13'h1ff1:begin qt_result_half_denorm_for_round[13:0] = {total_qt_rt_58[46:42],9'b0}; //-15 0
+                 half_denorm_lst_frac =  total_qt_rt_58[47];
+						end//-1022 1
+  13'h1ff0:begin qt_result_half_denorm_for_round[13:0] = {total_qt_rt_58[47:42],8'b0}; //-16 -1
+                 half_denorm_lst_frac =  total_qt_rt_58[48];
+						end//-1022 1
+  13'h1fef:begin qt_result_half_denorm_for_round[13:0] = {total_qt_rt_58[48:42],7'b0}; //-17 -2
+                 half_denorm_lst_frac =  total_qt_rt_58[49];
+						end//-1022 1
+  13'h1fee:begin qt_result_half_denorm_for_round[13:0] = {total_qt_rt_58[49:42],6'b0}; //-18 -3
+                 half_denorm_lst_frac =  total_qt_rt_58[50];
+						end//-1022 1
+  13'h1fed:begin qt_result_half_denorm_for_round[13:0] = {total_qt_rt_58[50:42],5'b0}; //-19 -4
+                 half_denorm_lst_frac =  total_qt_rt_58[51];
+						end//-1022 1
+  13'h1fec:begin qt_result_half_denorm_for_round[13:0] = {total_qt_rt_58[51:42],4'b0}; //-20 -5
+                 half_denorm_lst_frac =  total_qt_rt_58[52];
+						end//-1022 1
+  13'h1feb:begin qt_result_half_denorm_for_round[13:0] = {total_qt_rt_58[52:42],3'b0}; //-21 -6
+                 half_denorm_lst_frac =  total_qt_rt_58[53];
+						end//-1022 1
+  13'h1fea:begin qt_result_half_denorm_for_round[13:0] = {total_qt_rt_58[53:42],2'b0}; //-22 -7
+                 half_denorm_lst_frac =  total_qt_rt_58[54];
+						end//-1022 1
+  13'h1fe9:begin qt_result_half_denorm_for_round[13:0] = {total_qt_rt_58[54:42],1'b0}; //-23 -8
+                 half_denorm_lst_frac =  total_qt_rt_58[55];
+						end//-1022 1
+  13'h1fe8:begin qt_result_half_denorm_for_round[13:0] = {total_qt_rt_58[55:42]}; //-24 -9
+                 half_denorm_lst_frac =  total_qt_rt_58[56];
+						end//-1022 1
+  default:  begin qt_result_half_denorm_for_round[13:0] = {total_qt_rt_58[56:43]}; // any other exponent: q[56:43] is the round field
+                 half_denorm_lst_frac = 1'b0;
+						end//-1022 1
+endcase
+// &CombEnd;  @411
+end
+//rounding evaluation for half-precision denormalized numbers
+assign ex3_half_low_not_zero   = |qt_result_half_denorm_for_round[12:0];   // any bit set below the round-field MSB
+assign ex3_half_denorm_zero    = !(qt_result_half_denorm_for_round[13]     // round field is entirely zero
+                                   || ex3_half_low_not_zero);
+assign ex3_half_denorm_eq      = !ex3_half_low_not_zero                    // round field is exactly 100..0
+                                 && qt_result_half_denorm_for_round[13];
+assign ex3_half_denorm_gr      = ex3_half_low_not_zero                     // round field is above 100..0
+                                 && qt_result_half_denorm_for_round[13];
+
+assign ex3_denorm_eq       = vfdsu_ex3_double ? ex3_double_denorm_eq                              // double wins,
+                           : (vfdsu_ex3_single ? ex3_single_denorm_eq : ex3_half_denorm_eq);      // then single, else half
+assign ex3_denorm_gr       = vfdsu_ex3_double ? ex3_double_denorm_gr
+                           : (vfdsu_ex3_single ? ex3_single_denorm_gr : ex3_half_denorm_gr);
+assign ex3_denorm_zero     = vfdsu_ex3_double ? ex3_double_denorm_zero
+                           : (vfdsu_ex3_single ? ex3_single_denorm_zero : ex3_half_denorm_zero);
+assign ex3_denorm_lst_frac = vfdsu_ex3_double ? double_denorm_lst_frac                            // bit just above the round field
+                           : (vfdsu_ex3_single ? single_denorm_lst_frac : half_denorm_lst_frac);
+  
+//Different Round Mode with different rounding rule
+//Here we call rounding bit as "rb", remainder as "rem"
+//RNE : 
+//  1.+1 : rb>10000 || rb==10000 && rem>0
+//  2. 0 : Rest Condition
+//  3.-1 : Never occur
+//RTZ : 
+//  1.+1 : Never occur
+//  2. 0 : Rest Condition
+//  3.-1 : rb=0 && rem<0
+//RDN : 
+//  1.+1 : Q>0 Never occur   ; Q<0 Rest condition
+//  2. 0 : Q>0 Rest condition; Q<0 Rem<0 && rb=0 
+//  3.-1 : Q>0 Rem<0 && rb=0 ; Q<0 Never occur
+//RUP : 
+//  1.+1 : Q>0 Rest Condition; Q<0 Never occur
+//  2. 0 : Q>0 Rem<0 && rb=0 ; Q<0 Rest condition
+//  3.-1 : Q>0 Never occur   ; Q<0 Rem<0 && rb=0 
+//RMM : 
+//  1.+1 : rb>10000 || rb==10000 && rem>0
+//  2. 0 : Rest Condition
+//  3.-1 : Never occur
+assign frac_rne_add_1 = (!vfdsu_ex3_rem_sign && ex3_qt_eq)             // RNE +1: round bits == 10..0 with rem sign clear,
+                      || ex3_qt_gr;                                     //         or round bits > 10..0
+assign frac_rtz_sub_1 = vfdsu_ex3_rem_sign && ex3_qt_zero;             // RTZ -1: round bits zero and rem sign set
+assign frac_rup_add_1 = (!ex3_qt_zero                                  // RUP +1: result sign clear and (round bits non-zero
+                      || !(vfdsu_ex3_rem_sign || vfdsu_ex3_rem_zero))   //         or rem neither negative nor zero)
+                      && !vfdsu_ex3_result_sign;
+assign frac_rup_sub_1 = (vfdsu_ex3_rem_sign && ex3_qt_zero)            // RUP -1: result sign set, round bits zero, rem sign set
+                      && vfdsu_ex3_result_sign;
+assign frac_rdn_add_1 = (!ex3_qt_zero                                  // RDN +1: same as RUP +1 but for result sign set
+                      || !(vfdsu_ex3_rem_sign || vfdsu_ex3_rem_zero))
+                      && vfdsu_ex3_result_sign;
+assign frac_rdn_sub_1 = (vfdsu_ex3_rem_sign && ex3_qt_zero)            // RDN -1: same as RUP -1 but for result sign clear
+                      && !vfdsu_ex3_result_sign;
+assign frac_rmm_add_1 = (!vfdsu_ex3_rem_sign && ex3_qt_eq)             // RMM +1: identical condition to RNE +1
+                      || ex3_qt_gr;
+//denormal result 
+assign frac_denorm_rne_add_1 = ex3_denorm_gr ||                        // RNE +1 (denormal): round field above 100..0, or
+                               (ex3_denorm_eq &&                       // exactly 100..0 and either
+                               ((!vfdsu_ex3_rem_sign &&                //   rem non-zero with its sign clear,
+                                 !vfdsu_ex3_rem_zero) ||
+                               (ex3_denorm_lst_frac &&                 //   or rem zero and the bit above the round field set
+                                vfdsu_ex3_rem_zero)));
+assign frac_denorm_rtz_sub_1 = vfdsu_ex3_rem_sign && ex3_denorm_zero;  // RTZ -1: round field zero and rem sign set
+assign frac_denorm_rup_add_1 = (!ex3_denorm_zero ||                    // RUP +1: result sign clear and (round field non-zero
+                               !(vfdsu_ex3_rem_sign || vfdsu_ex3_rem_zero)) && // or rem neither negative nor zero)
+                               !vfdsu_ex3_result_sign;
+assign frac_denorm_rup_sub_1 = vfdsu_ex3_result_sign &&                // RUP -1: result sign set, round field zero, rem sign set
+                               (vfdsu_ex3_rem_sign && ex3_denorm_zero);
+assign frac_denorm_rdn_add_1 = (!ex3_denorm_zero ||                    // RDN +1: same as RUP +1 but for result sign set
+                               !(vfdsu_ex3_rem_sign || vfdsu_ex3_rem_zero)) &&
+                               vfdsu_ex3_result_sign;
+assign frac_denorm_rdn_sub_1 = !vfdsu_ex3_result_sign &&               // RDN -1: same as RUP -1 but for result sign clear
+                               (vfdsu_ex3_rem_sign && ex3_denorm_zero);
+assign frac_denorm_rmm_add_1 = (!vfdsu_ex3_rem_sign && ex3_denorm_eq) || // RMM +1: round field 100..0 with rem sign clear, or above
+                               ex3_denorm_gr;
+
+//RM select
+// &CombBeg; @489
+always @( vfdsu_ex3_result_sign
+       or frac_rtz_sub_1
+       or frac_rdn_add_1
+       or frac_denorm_rtz_sub_1
+       or frac_rup_sub_1
+       or frac_denorm_rmm_add_1
+       or frac_denorm_rne_add_1
+       or frac_rmm_add_1
+       or frac_denorm_rdn_add_1
+       or frac_rne_add_1
+       or frac_denorm_rdn_sub_1
+       or frac_rup_add_1
+       or frac_denorm_rup_sub_1
+       or frac_rdn_sub_1
+       or ex3_rslt_denorm
+       or vfdsu_ex3_rm[2:0]
+       or frac_denorm_rup_add_1
+       or vfdsu_ex3_id_srt_skip)
+begin
+case(vfdsu_ex3_rm[2:0]) // per rounding mode: pick add-1 / sub-1 / keep, using the denormal variants when ex3_rslt_denorm is set
+  3'b000://round to nearest, ties to even
+  begin 
+    frac_add_1          =  ex3_rslt_denorm ? frac_denorm_rne_add_1 : frac_rne_add_1;
+    frac_sub_1          =  1'b0; // this mode never subtracts
+    frac_orig           =  ex3_rslt_denorm ? !frac_denorm_rne_add_1 : !frac_rne_add_1; // keep the quotient when no add is selected
+    denorm_to_tiny_frac =  vfdsu_ex3_id_srt_skip ? 1'b0 : frac_denorm_rne_add_1; // forced to 0 when vfdsu_ex3_id_srt_skip is set
+  end
+  3'b001:// round to 0
+  begin 
+    frac_add_1           =  1'b0; // this mode never adds
+    frac_sub_1           =  ex3_rslt_denorm ? frac_denorm_rtz_sub_1 : frac_rtz_sub_1;
+    frac_orig            =  ex3_rslt_denorm ? !frac_denorm_rtz_sub_1 : !frac_rtz_sub_1; // keep the quotient when no sub is selected
+    denorm_to_tiny_frac  = 1'b0;
+  end
+  3'b010://round to -inf
+  begin 
+    frac_add_1          =  ex3_rslt_denorm ? frac_denorm_rdn_add_1 : frac_rdn_add_1;
+    frac_sub_1          =  ex3_rslt_denorm ? frac_denorm_rdn_sub_1 : frac_rdn_sub_1;
+    frac_orig           =  ex3_rslt_denorm ? !frac_denorm_rdn_add_1 && !frac_denorm_rdn_sub_1 
+                                           : !frac_rdn_add_1 && !frac_rdn_sub_1; // keep when neither add nor sub is selected
+    denorm_to_tiny_frac = vfdsu_ex3_id_srt_skip ? vfdsu_ex3_result_sign 
+                                                : frac_denorm_rdn_add_1; // with id_srt_skip set, follows the result sign
+  end
+  3'b011://round to +inf
+  begin 
+    frac_add_1          =  ex3_rslt_denorm ? frac_denorm_rup_add_1 : frac_rup_add_1;
+    frac_sub_1          =  ex3_rslt_denorm ? frac_denorm_rup_sub_1 : frac_rup_sub_1; 
+    frac_orig           =  ex3_rslt_denorm ? !frac_denorm_rup_add_1 && !frac_denorm_rup_sub_1 
+                                           : !frac_rup_add_1 && !frac_rup_sub_1; // keep when neither add nor sub is selected
+    denorm_to_tiny_frac = vfdsu_ex3_id_srt_skip ? !vfdsu_ex3_result_sign 
+                                                : frac_denorm_rup_add_1; // with id_srt_skip set, follows the inverted result sign
+  end
+  3'b100://round to nearest,ties to max magnitude
+  begin 
+    frac_add_1          = ex3_rslt_denorm ? frac_denorm_rmm_add_1 : frac_rmm_add_1;
+    frac_sub_1          = 1'b0; // this mode never subtracts
+    frac_orig           = ex3_rslt_denorm ? !frac_denorm_rmm_add_1 : !frac_rmm_add_1; // keep the quotient when no add is selected
+    denorm_to_tiny_frac = vfdsu_ex3_id_srt_skip ? 1'b0 : frac_denorm_rmm_add_1; // forced to 0 when vfdsu_ex3_id_srt_skip is set
+  end
+  default: // remaining rm encodings (3'b101..3'b111): all four selects are driven to 0
+  begin 
+    frac_add_1          = 1'b0;
+    frac_sub_1          = 1'b0;
+    frac_orig           = 1'b0;
+    denorm_to_tiny_frac = 1'b0;
+  end
+endcase
+// &CombEnd; @538
+end
+//Add 1 or Sub 1 constant: frac_add1_op1 is the 55-bit increment, frac_sub1_op1 is its two's-complement negation
+// &CombBeg; @540
+always @( total_qt_rt_58[56]
+       or vfdsu_ex3_single
+       or vfdsu_ex3_double)
+begin
+case({total_qt_rt_58[56],vfdsu_ex3_double,vfdsu_ex3_single}) // selector = {quotient bit 56, double flag, single flag}
+  3'b001: // single flag set, total_qt_rt_58[56]=0: +2^29 / -2^29
+  begin
+    frac_add1_op1[54:0] = {2'b0,24'b1,29'b0};
+    frac_sub1_op1[54:0] = {2'b11,{24{1'b1}},29'b0};
+  end
+  3'b010: // double flag set, total_qt_rt_58[56]=0: +1 / -1
+  begin
+    frac_add1_op1[54:0] = 55'b1;
+    frac_sub1_op1[54:0] = {55{1'b1}};
+  end
+  3'b101: // single flag set, total_qt_rt_58[56]=1: +2^30 / -2^30 (one bit position higher)
+  begin
+    frac_add1_op1[54:0] = {25'b1,30'b0};
+    frac_sub1_op1[54:0] = {{25{1'b1}},30'b0};
+  end
+  3'b110: // double flag set, total_qt_rt_58[56]=1: +2 / -2
+  begin
+    frac_add1_op1[54:0] = 55'b10;
+    frac_sub1_op1[54:0] = {{54{1'b1}},1'b0};
+  end
+  3'b100: // neither flag set (presumably half precision - confirm), total_qt_rt_58[56]=1: +2^43 / -2^43
+  begin
+    frac_add1_op1[54:0] = {12'b1,43'b0};
+    frac_sub1_op1[54:0] = {{12{1'b1}},43'b0};
+  end
+  3'b000: // neither flag set (presumably half precision - confirm), total_qt_rt_58[56]=0: +2^42 / -2^42
+  begin
+    frac_add1_op1[54:0] = {13'b1,42'b0};
+    frac_sub1_op1[54:0] = {{13{1'b1}},42'b0};
+  end
+  default: // remaining encodings (double and single flags both set): both constants are zero
+  begin
+    frac_add1_op1[54:0] = 55'b0;
+    frac_sub1_op1[54:0] = 55'b0;
+  end
+endcase
+// &CombEnd; @578
+end
+//Add 1 or Sub 1 final result
+//Corner case: when the quotient is 0.010000...00 and the remainder is negative,
+//the real quotient is actually 0.00fff..ff. 
+//The final result then needs to subtract 1 when
+//RN : never occurs
+//RP : sign of quotient is -
+//RM : sign of quotient is +
+assign frac_add1_rst[54:0]             = {1'b0,total_qt_rt_58[56:3]} +
+                                         frac_add1_op1_with_denorm[54:0]; // quotient bits [56:3], zero-extended, plus the increment
+assign frac_add1_op1_with_denorm[54:0] = ex3_rslt_denorm ? 
+                                  {1'b0,vfdsu_ex3_result_denorm_round_add_num[52:0],1'b0} :
+                                  frac_add1_op1[54:0];      // denormal result uses the denorm round-add number instead of the fixed constant
+assign frac_sub1_rst[54:0]             = (ex3_rst_eq_1) // ex3_rst_eq_1 forces the constant below; presumably the corner case described above - confirm at its definition
+                                       ? {2'b0,{53{1'b1}}}
+                                       : {1'b0,total_qt_rt_58[56:3]} +
+                                         frac_sub1_op1_with_denorm[54:0] + {54'b0,ex3_rslt_denorm}; // the +1 for denorm completes the two's complement of the inverted operand
+assign frac_sub1_op1_with_denorm[54:0] = ex3_rslt_denorm ?
+                                ~{1'b0,vfdsu_ex3_result_denorm_round_add_num[52:0],1'b0} :
+                                frac_sub1_op1[54:0]; // non-denorm path: frac_sub1_op1 is already a negated constant
+assign frac_final_rst[54:0]           = (frac_add1_rst[54:0]         & {55{frac_add_1}}) |
+                                        (frac_sub1_rst[54:0]         & {55{frac_sub_1}}) |
+                                        ({1'b0,total_qt_rt_58[56:3]} & {55{frac_orig}}); // AND-OR select of +1 / -1 / unchanged; all-zero if no select is set
+
+
+//===============Pipe down signal prepare===================
+assign ex3_rst_nor = !vfdsu_ex3_result_zero && 
+                     !vfdsu_ex3_result_qnan && 
+                     !vfdsu_ex3_result_inf  && 
+                     !vfdsu_ex3_result_lfn; // set when the result is none of zero / qnan / inf / lfn
+assign ex3_nx      = ex3_rst_nor && 
+                    (!ex3_qt_zero || !vfdsu_ex3_rem_zero || ex3_denorm_nx); // nx flag: only for a "nor" result with a non-zero ex3_qt_zero/remainder term or denorm nx
+assign ex3_denorm_nx = ex3_rslt_denorm && (!ex3_denorm_zero ||  !vfdsu_ex3_rem_zero);
+//Adjust expnt
+//Div: actual expnt should be incremented by 1 when op0 is id, decremented by 1 when op1 is id
+assign ex3_expnt_adjst[12:0] = vfdsu_ex3_double ? 13'h3ff: vfdsu_ex3_single ? 13'h7f : 13'hf; // 1023 / 127 / 15 by format; presumably the exponent bias - confirm
+assign ex3_expnt_adjust_result[12:0] = vfdsu_ex3_expnt_rst[12:0] + 
+                                       ex3_expnt_adjst[12:0];
+//this information is for the packing, which determines whether the result is a normal
+//number or not;
+assign ex3_potnt_norm[1:0]    = {ex3_denorm_plus,ex3_denorm_potnt_norm};
+//=======================Pipe to EX4========================
+//gate clk: ex3_pipe_clk is derived from forever_cpuclk and clocks the EX4 registers below
+// &Instance("gated_clk_cell","x_ex3_pipe_clk"); @620
+gated_clk_cell  x_ex3_pipe_clk (
+  .clk_in             (forever_cpuclk    ),
+  .clk_out            (ex3_pipe_clk      ),
+  .external_en        (1'b0              ), // external enable tied off
+  .global_en          (cp0_yy_clk_en     ),
+  .local_en           (ex3_pipe_clk_en   ), // local condition, see assign below
+  .module_en          (cp0_vfpu_icg_en   ),
+  .pad_yy_icg_scan_en (pad_yy_icg_scan_en)
+);
+
+// &Connect( .clk_in         (forever_cpuclk), @621
+//           .clk_out        (ex3_pipe_clk),//Out Clock @622
+//           .external_en    (1'b0), @623
+//           .global_en      (cp0_yy_clk_en), @624
+//           .local_en       (ex3_pipe_clk_en),//Local Condition @625
+//           .module_en      (cp0_vfpu_icg_en) @626
+//         ); @627
+assign ex3_pipe_clk_en = ex3_pipedown; // local clock enable is exactly the EX3 pipe-down request
+
+always @(posedge ex3_pipe_clk or negedge cpurst_b) // EX3->EX4 pipeline registers: gated clock, asynchronous active-low reset
+begin
+  if(!cpurst_b) // reset: clear every EX4 field
+  begin
+    vfdsu_ex4_result_zero     <=  1'b0;
+    vfdsu_ex4_result_qnan     <=  1'b0;
+    vfdsu_ex4_result_inf      <=  1'b0;
+    vfdsu_ex4_result_lfn      <=  1'b0;
+    vfdsu_ex4_result_sign     <=  1'b0;
+    vfdsu_ex4_potnt_of        <=  1'b0;
+    vfdsu_ex4_potnt_uf        <=  1'b0;
+    vfdsu_ex4_result_nor      <=  1'b0;
+    vfdsu_ex4_expnt_rst[12:0] <= 13'b0;
+    vfdsu_ex4_nv              <=  1'b0; 
+    vfdsu_ex4_nx              <=  1'b0; 
+    vfdsu_ex4_uf              <=  1'b0; 
+    vfdsu_ex4_of              <=  1'b0; 
+    vfdsu_ex4_dz              <=  1'b0; 
+    vfdsu_ex4_of_rst_lfn      <=  1'b0;
+    vfdsu_ex4_frac[54:0]      <= 55'b0;
+    vfdsu_ex4_qnan_sign       <=  1'b0;    
+    vfdsu_ex4_qnan_f[51:0]    <= 52'b0;
+    vfdsu_ex4_rslt_denorm     <= 1'b0;
+    vfdsu_ex4_denorm_to_tiny_frac
+                              <= 1'b0;
+    vfdsu_ex4_potnt_norm[1:0] <= 2'b0;
+    vfdsu_ex4_double          <= 1'b0;
+    vfdsu_ex4_single          <= 1'b0;
+
+  end
+  else if(ex3_pipedown) // capture the EX3 results when the stage pipes down
+  begin
+    vfdsu_ex4_result_zero     <= vfdsu_ex3_result_zero;
+    vfdsu_ex4_result_qnan     <= vfdsu_ex3_result_qnan;
+    vfdsu_ex4_result_inf      <= vfdsu_ex3_result_inf;
+    vfdsu_ex4_result_lfn      <= vfdsu_ex3_result_lfn;
+    vfdsu_ex4_result_sign     <= vfdsu_ex3_result_sign;
+    vfdsu_ex4_potnt_of        <= vfdsu_ex3_potnt_of;
+    vfdsu_ex4_potnt_uf        <= vfdsu_ex3_potnt_uf;
+    vfdsu_ex4_result_nor      <= ex3_rst_nor;
+    vfdsu_ex4_expnt_rst[12:0] <= ex3_expnt_adjust_result[12:0];
+    vfdsu_ex4_nv              <= vfdsu_ex3_nv; 
+    vfdsu_ex4_nx              <= ex3_nx; 
+    vfdsu_ex4_uf              <= vfdsu_ex3_uf; 
+    vfdsu_ex4_of              <= vfdsu_ex3_of; 
+    vfdsu_ex4_dz              <= vfdsu_ex3_dz; 
+    vfdsu_ex4_of_rst_lfn      <= vfdsu_ex2_of_rm_lfn; // NOTE(review): loaded from an EX2-named signal, unlike the EX3 sources around it - confirm it is still valid here
+    vfdsu_ex4_frac[54:0]      <= frac_final_rst[54:0];
+    vfdsu_ex4_qnan_sign       <= vfdsu_ex3_qnan_sign;    
+    vfdsu_ex4_qnan_f[51:0]    <= vfdsu_ex3_qnan_f[51:0];
+    vfdsu_ex4_rslt_denorm     <= ex3_rslt_denorm;
+    vfdsu_ex4_denorm_to_tiny_frac 
+                              <= denorm_to_tiny_frac;
+    vfdsu_ex4_potnt_norm[1:0] <= ex3_potnt_norm[1:0];
+    vfdsu_ex4_double          <= vfdsu_ex3_double;
+    vfdsu_ex4_single          <= vfdsu_ex3_single;
+  end
+  else // explicit hold: every register is re-assigned its own value
+  begin
+    vfdsu_ex4_result_zero     <= vfdsu_ex4_result_zero;
+    vfdsu_ex4_result_qnan     <= vfdsu_ex4_result_qnan;
+    vfdsu_ex4_result_inf      <= vfdsu_ex4_result_inf;
+    vfdsu_ex4_result_lfn      <= vfdsu_ex4_result_lfn;
+    vfdsu_ex4_result_sign     <= vfdsu_ex4_result_sign;
+    vfdsu_ex4_potnt_of        <= vfdsu_ex4_potnt_of;
+    vfdsu_ex4_potnt_uf        <= vfdsu_ex4_potnt_uf;
+    vfdsu_ex4_result_nor      <= vfdsu_ex4_result_nor;
+    vfdsu_ex4_expnt_rst[12:0] <= vfdsu_ex4_expnt_rst[12:0];
+    vfdsu_ex4_nv              <= vfdsu_ex4_nv; 
+    vfdsu_ex4_nx              <= vfdsu_ex4_nx; 
+    vfdsu_ex4_uf              <= vfdsu_ex4_uf; 
+    vfdsu_ex4_of              <= vfdsu_ex4_of; 
+    vfdsu_ex4_dz              <= vfdsu_ex4_dz; 
+    vfdsu_ex4_of_rst_lfn      <= vfdsu_ex4_of_rst_lfn;
+    vfdsu_ex4_frac[54:0]      <= vfdsu_ex4_frac[54:0];
+    vfdsu_ex4_qnan_sign       <= vfdsu_ex4_qnan_sign;
+    vfdsu_ex4_qnan_f[51:0]    <= vfdsu_ex4_qnan_f[51:0];
+    vfdsu_ex4_rslt_denorm     <= vfdsu_ex4_rslt_denorm;
+    vfdsu_ex4_denorm_to_tiny_frac 
+                              <= vfdsu_ex4_denorm_to_tiny_frac;
+    vfdsu_ex4_potnt_norm[1:0] <= vfdsu_ex4_potnt_norm[1:0];
+    vfdsu_ex4_double          <= vfdsu_ex4_double;
+    vfdsu_ex4_single          <= vfdsu_ex4_single;
+  end  
+end    
+
+// &Force("output","vfdsu_ex4_result_nor"); @716
+// &Force("output","vfdsu_ex4_nx"); @717
+// &Force("output","vfdsu_ex4_nv"); @718
+// &Force("output","vfdsu_ex4_uf"); @719
+// &Force("output","vfdsu_ex4_of"); @720
+// &Force("output","vfdsu_ex4_dz"); @721
+// &Force("output","vfdsu_ex4_result_sign"); @722
+// &Force("output","vfdsu_ex4_of_rst_lfn"); @723
+// &Force("output","vfdsu_ex4_potnt_of"); @724
+// &Force("output","vfdsu_ex4_potnt_uf"); @725
+// &Force("output","vfdsu_ex4_result_inf"); @726
+// &Force("output","vfdsu_ex4_result_lfn"); @727
+// &Force("output","vfdsu_ex4_result_qnan"); @728
+// &Force("output","vfdsu_ex4_result_zero"); @729
+// &Force("output","vfdsu_ex4_frac"); @730
+// &Force("output","vfdsu_ex4_expnt_rst"); @731
+// &Force("output","vfdsu_ex4_qnan_sign"); @732
+// &Force("output","vfdsu_ex4_qnan_f"); @733
+// &Force("output","vfdsu_ex4_rslt_denorm"); @734
+// &Force("output","vfdsu_ex4_denorm_to_tiny_frac"); @735
+// &Force("output","vfdsu_ex4_potnt_norm"); @736
+// &Force("output","vfdsu_ex4_double"); @737
+// &Force("output","vfdsu_ex4_single"); @738
+// &ModuleEnd; @739
+endmodule
+
+
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v
new file mode 100644
index 00000000..c7a679c1
--- /dev/null
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v
@@ -0,0 +1,323 @@
+/*Copyright 2019-2021 T-Head Semiconductor Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// &Depend("cpu_cfig.h"); @22
+// &ModuleBeg; @23
+module ct_vfdsu_scalar_dp( // latches the op-type flags from the func bus in EX1 and carries dst ereg/vreg/iid through EX2..EX4
+  cp0_vfpu_icg_en,
+  cp0_yy_clk_en,
+  cpurst_b,
+  dp_vfdsu_ex1_pipex_dst_ereg,
+  dp_vfdsu_ex1_pipex_dst_vreg,
+  dp_vfdsu_ex1_pipex_iid,
+  dp_vfdsu_ex1_pipex_imm0,
+  dp_vfdsu_ex1_pipex_srcf0,
+  dp_vfdsu_ex1_pipex_srcf1,
+  ex1_data_clk,
+  ex1_div,
+  ex1_double,
+  ex1_pipedown,
+  ex1_scalar,
+  ex1_single,
+  ex1_sqrt,
+  ex1_src0,
+  ex1_src1,
+  ex1_static_rm,
+  ex2_data_clk,
+  ex2_pipedown,
+  ex3_data_clk,
+  ex3_pipedown,
+  ex4_out_expt,
+  ex4_out_result,
+  forever_cpuclk,
+  idu_vfpu_rf_pipex_func,
+  idu_vfpu_rf_pipex_gateclk_sel,
+  pad_yy_icg_scan_en,
+  pipex_dp_vfdsu_ereg,
+  pipex_dp_vfdsu_ereg_data,
+  pipex_dp_vfdsu_freg_data,
+  pipex_dp_vfdsu_vreg,
+  vfdsu_ex2_double,
+  vfdsu_ex2_single
+);
+
+// &Ports; @24
+input           cp0_vfpu_icg_en;              
+input           cp0_yy_clk_en;                
+input           cpurst_b;                     
+input   [4 :0]  dp_vfdsu_ex1_pipex_dst_ereg;  
+input   [6 :0]  dp_vfdsu_ex1_pipex_dst_vreg;  
+input   [6 :0]  dp_vfdsu_ex1_pipex_iid;       
+input   [2 :0]  dp_vfdsu_ex1_pipex_imm0;      // passed through unchanged to ex1_static_rm
+input   [63:0]  dp_vfdsu_ex1_pipex_srcf0;     
+input   [63:0]  dp_vfdsu_ex1_pipex_srcf1;     
+input           ex1_data_clk;                 
+input           ex1_pipedown;                 
+input           ex2_data_clk;                 
+input           ex2_pipedown;                 
+input           ex3_data_clk;                 
+input           ex3_pipedown;                 
+input   [4 :0]  ex4_out_expt;                 // passed through unchanged to pipex_dp_vfdsu_ereg_data
+input   [63:0]  ex4_out_result;               // passed through unchanged to pipex_dp_vfdsu_freg_data
+input           forever_cpuclk;               
+input   [19:0]  idu_vfpu_rf_pipex_func;       // only bits 0, 1, 15 and 16 are read in this module
+input           idu_vfpu_rf_pipex_gateclk_sel; // clock-gate enable and load enable for the EX1 flag registers
+input           pad_yy_icg_scan_en;           
+output          ex1_div;                      
+output          ex1_double;                   
+output          ex1_scalar;                   // constant 1'b1
+output          ex1_single;                   
+output          ex1_sqrt;                     
+output  [63:0]  ex1_src0;                     
+output  [63:0]  ex1_src1;                     
+output  [2 :0]  ex1_static_rm;                
+output  [4 :0]  pipex_dp_vfdsu_ereg;          
+output  [4 :0]  pipex_dp_vfdsu_ereg_data;     
+output  [63:0]  pipex_dp_vfdsu_freg_data;     
+output  [6 :0]  pipex_dp_vfdsu_vreg;          
+output          vfdsu_ex2_double;             
+output          vfdsu_ex2_single;             
+
+// &Regs; @25
+reg             ex1_div;                      
+reg             ex1_double;                   
+reg             ex1_single;                   
+reg             ex1_sqrt;                     
+reg             vfdsu_ex2_div;                // registered below but not driven to any port of this module
+reg             vfdsu_ex2_double;             
+reg     [4 :0]  vfdsu_ex2_dst_ereg;           
+reg     [6 :0]  vfdsu_ex2_dst_vreg;           
+reg     [6 :0]  vfdsu_ex2_iid;                
+reg             vfdsu_ex2_single;             
+reg             vfdsu_ex2_sqrt;               // registered below but not driven to any port of this module
+reg     [4 :0]  vfdsu_ex3_dst_ereg;           
+reg     [6 :0]  vfdsu_ex3_dst_vreg;           
+reg     [6 :0]  vfdsu_ex3_iid;                
+reg     [4 :0]  vfdsu_ex4_dst_ereg;           
+reg     [6 :0]  vfdsu_ex4_dst_vreg;           
+reg     [6 :0]  vfdsu_ex4_iid;                // written in the EX4 stage but not read anywhere else in this module
+
+// &Wires; @26
+wire            cp0_vfpu_icg_en;              
+wire            cp0_yy_clk_en;                
+wire            cpurst_b;                     
+wire    [4 :0]  dp_vfdsu_ex1_pipex_dst_ereg;  
+wire    [6 :0]  dp_vfdsu_ex1_pipex_dst_vreg;  
+wire    [6 :0]  dp_vfdsu_ex1_pipex_iid;       
+wire    [2 :0]  dp_vfdsu_ex1_pipex_imm0;      
+wire    [63:0]  dp_vfdsu_ex1_pipex_srcf0;     
+wire    [63:0]  dp_vfdsu_ex1_pipex_srcf1;     
+wire            ex1_data_clk;                 
+wire            ex1_pipedown;                 
+wire            ex1_scalar;                   
+wire    [63:0]  ex1_src0;                     
+wire    [63:0]  ex1_src1;                     
+wire    [2 :0]  ex1_static_rm;                
+wire            ex2_data_clk;                 
+wire            ex2_pipedown;                 
+wire            ex3_data_clk;                 
+wire            ex3_pipedown;                 
+wire    [4 :0]  ex4_out_expt;                 
+wire    [63:0]  ex4_out_result;               
+wire            forever_cpuclk;               
+wire    [19:0]  idu_vfpu_rf_pipex_func;       
+wire            idu_vfpu_rf_pipex_gateclk_sel; 
+wire            pad_yy_icg_scan_en;           
+wire    [4 :0]  pipex_dp_vfdsu_ereg;          
+wire    [4 :0]  pipex_dp_vfdsu_ereg_data;     
+wire    [63:0]  pipex_dp_vfdsu_freg_data;     
+wire    [6 :0]  pipex_dp_vfdsu_vreg;          
+wire            vfdsu_sew_clk;                
+wire            vfdsu_sew_clk_en;             
+
+
+//==========================================================
+//              EX1 Stage Control Signal
+//==========================================================
+// &Force("bus","idu_vfpu_rf_pipex_func",19,0); @31
+//assign func[19:0]         = dp_vfdsu_ex1_pipex_func[19:0];
+// &Instance("gated_clk_cell","x_vfdsu_sew_clk"); @33
+gated_clk_cell  x_vfdsu_sew_clk ( // derives vfdsu_sew_clk from forever_cpuclk for the EX1 flag registers
+  .clk_in             (forever_cpuclk    ),
+  .clk_out            (vfdsu_sew_clk     ),
+  .external_en        (1'b0              ),
+  .global_en          (cp0_yy_clk_en     ),
+  .local_en           (vfdsu_sew_clk_en  ),
+  .module_en          (cp0_vfpu_icg_en   ),
+  .pad_yy_icg_scan_en (pad_yy_icg_scan_en)
+);
+
+// &Connect( .clk_in         (forever_cpuclk), @34
+//           .clk_out        (vfdsu_sew_clk),//Out Clock @35
+//           .external_en    (1'b0), @36
+//           .global_en      (cp0_yy_clk_en), @37
+//           .local_en       (vfdsu_sew_clk_en),//Local Condition @38
+//           .module_en      (cp0_vfpu_icg_en) @39
+//         ); @40
+assign  vfdsu_sew_clk_en = idu_vfpu_rf_pipex_gateclk_sel;       // local clock enable follows the gateclk select input
+always @(posedge vfdsu_sew_clk or negedge cpurst_b) // EX1 flag registers: asynchronous active-low reset
+begin
+  if(!cpurst_b)
+  begin
+    ex1_div            <= 1'b0;
+    ex1_sqrt           <= 1'b0;
+    ex1_double         <= 1'b0;
+    ex1_single         <= 1'b0;
+  end
+  else if(idu_vfpu_rf_pipex_gateclk_sel) // load from the func bus only when selected; no else branch, so values are kept otherwise
+  begin
+    ex1_div            <= idu_vfpu_rf_pipex_func[0];
+    ex1_sqrt           <= idu_vfpu_rf_pipex_func[1];
+    ex1_double         <= idu_vfpu_rf_pipex_func[16];
+    ex1_single         <= idu_vfpu_rf_pipex_func[15];
+  end
+end
+assign ex1_scalar         = 1'b1; // this datapath always reports ex1_scalar as set
+assign ex1_static_rm[2:0] = dp_vfdsu_ex1_pipex_imm0[2:0]; // ex1_static_rm is imm0 unmodified
+// &Force("output","ex1_div"); @61
+// &Force("output","ex1_sqrt"); @62
+// &Force("output","ex1_double"); @63
+// &Force("output","ex1_single"); @64
+
+assign ex1_src0[63:0]    = dp_vfdsu_ex1_pipex_srcf0[63:0]; // source operands are passed through unmodified
+assign ex1_src1[63:0]    = dp_vfdsu_ex1_pipex_srcf1[63:0];
+
+
+always @(posedge ex1_data_clk or negedge cpurst_b) // EX1->EX2 registers: destination register IDs, iid and op-type flags
+begin
+  if(!cpurst_b)
+  begin
+    vfdsu_ex2_dst_ereg[4:0] <= 5'b0;
+    vfdsu_ex2_dst_vreg[6:0] <= 7'b0;
+    vfdsu_ex2_iid[6:0]      <= 7'b0;
+    vfdsu_ex2_double        <= 1'b0;
+    vfdsu_ex2_single        <= 1'b0;
+    vfdsu_ex2_div           <=  1'b0;
+    vfdsu_ex2_sqrt          <=  1'b0;
+  end
+  else if(ex1_pipedown) // capture when EX1 pipes down
+  begin
+    vfdsu_ex2_dst_ereg[4:0] <= dp_vfdsu_ex1_pipex_dst_ereg[4:0];
+    vfdsu_ex2_dst_vreg[6:0] <= dp_vfdsu_ex1_pipex_dst_vreg[6:0];
+    vfdsu_ex2_iid[6:0]      <= dp_vfdsu_ex1_pipex_iid[6:0];
+    vfdsu_ex2_double        <= ex1_double;
+    vfdsu_ex2_single        <= ex1_single;
+    vfdsu_ex2_div           <= ex1_div;
+    vfdsu_ex2_sqrt          <= ex1_sqrt;
+  end
+  else // explicit hold: each register is re-assigned its own value
+  begin
+    vfdsu_ex2_dst_ereg[4:0] <= vfdsu_ex2_dst_ereg[4:0];
+    vfdsu_ex2_dst_vreg[6:0] <= vfdsu_ex2_dst_vreg[6:0];
+    vfdsu_ex2_iid[6:0]      <= vfdsu_ex2_iid[6:0];
+    vfdsu_ex2_double        <= vfdsu_ex2_double;
+    vfdsu_ex2_single        <= vfdsu_ex2_single;
+    vfdsu_ex2_div           <= vfdsu_ex2_div;
+    vfdsu_ex2_sqrt          <= vfdsu_ex2_sqrt;
+  end
+end
+// &Force("output","vfdsu_ex2_double"); @103
+// &Force("output","vfdsu_ex2_single"); @104
+// //&Force("output","vfdsu_ex2_div"); @105
+// //&Force("output","vfdsu_ex2_sqrt"); @106
+
+
+always @(posedge ex2_data_clk or negedge cpurst_b) // EX2->EX3 registers: only dst ereg/vreg and iid are carried (flag copies are commented out)
+begin
+  if(!cpurst_b)
+  begin
+    vfdsu_ex3_dst_ereg[4:0] <= 5'b0;
+    vfdsu_ex3_dst_vreg[6:0] <= 7'b0;
+    vfdsu_ex3_iid[6:0]      <= 7'b0;
+//    vfdsu_ex3_double        <= 1'b0;
+//    vfdsu_ex3_single        <= 1'b0;    
+//    vfdsu_ex3_div           <= 1'b0;
+//    vfdsu_ex3_sqrt          <= 1'b0;
+  end
+  else if(ex2_pipedown) // capture when EX2 pipes down
+  begin
+    vfdsu_ex3_dst_ereg[4:0] <= vfdsu_ex2_dst_ereg[4:0];
+    vfdsu_ex3_dst_vreg[6:0] <= vfdsu_ex2_dst_vreg[6:0];
+    vfdsu_ex3_iid[6:0]      <= vfdsu_ex2_iid[6:0];
+ //   vfdsu_ex3_double        <= vfdsu_ex2_double;
+//    vfdsu_ex3_single        <= vfdsu_ex2_single;
+//    vfdsu_ex3_div           <= vfdsu_ex2_div;
+//    vfdsu_ex3_sqrt          <= vfdsu_ex2_sqrt;
+  end
+  else // explicit hold
+  begin
+    vfdsu_ex3_dst_ereg[4:0] <= vfdsu_ex3_dst_ereg[4:0];
+    vfdsu_ex3_dst_vreg[6:0] <= vfdsu_ex3_dst_vreg[6:0];
+    vfdsu_ex3_iid[6:0]      <= vfdsu_ex3_iid[6:0];
+//    vfdsu_ex3_double        <= vfdsu_ex3_double;
+//    vfdsu_ex3_single        <= vfdsu_ex3_single;
+//    vfdsu_ex3_div           <= vfdsu_ex3_div;
+//    vfdsu_ex3_sqrt          <= vfdsu_ex3_sqrt;
+  end
+end
+// //&Force("output","vfdsu_ex3_double"); @142
+// //&Force("output","vfdsu_ex3_single"); @143
+
+always @(posedge ex3_data_clk or negedge cpurst_b) // EX3->EX4 registers: only dst ereg/vreg and iid are carried
+begin
+  if(!cpurst_b)
+  begin
+    vfdsu_ex4_dst_ereg[4:0] <= 5'b0;
+    vfdsu_ex4_dst_vreg[6:0] <= 7'b0;
+    vfdsu_ex4_iid[6:0]      <= 7'b0;
+//    vfdsu_ex4_double        <= 1'b0;
+//    vfdsu_ex4_single        <= 1'b0;
+//    vfdsu_ex4_div           <= 1'b0;
+//    vfdsu_ex4_sqrt          <= 1'b0;
+  end
+  else if(ex3_pipedown) // capture when EX3 pipes down
+  begin
+    vfdsu_ex4_dst_ereg[4:0] <= vfdsu_ex3_dst_ereg[4:0];
+    vfdsu_ex4_dst_vreg[6:0] <= vfdsu_ex3_dst_vreg[6:0];
+    vfdsu_ex4_iid[6:0]      <= vfdsu_ex3_iid[6:0];
+//    vfdsu_ex4_double        <= vfdsu_ex3_double;
+//    vfdsu_ex4_single        <= vfdsu_ex3_single;
+//    vfdsu_ex4_div           <= vfdsu_ex3_div;
+//    vfdsu_ex4_sqrt          <= vfdsu_ex3_sqrt;
+  end
+  else // explicit hold
+  begin
+    vfdsu_ex4_dst_ereg[4:0] <= vfdsu_ex4_dst_ereg[4:0];
+    vfdsu_ex4_dst_vreg[6:0] <= vfdsu_ex4_dst_vreg[6:0];
+    vfdsu_ex4_iid[6:0]      <= vfdsu_ex4_iid[6:0];
+//    vfdsu_ex4_double        <= vfdsu_ex4_double;
+//    vfdsu_ex4_single        <= vfdsu_ex4_single;
+//    vfdsu_ex4_div           <= vfdsu_ex4_div;
+//    vfdsu_ex4_sqrt          <= vfdsu_ex4_sqrt;
+  end
+end
+// //&Force("output","vfdsu_ex4_double"); @178
+// //&Force("output","vfdsu_ex4_single"); @179
+
+
+assign pipex_dp_vfdsu_ereg_data[4:0]   = ex4_out_expt[4:0]; // EX4 data inputs are forwarded combinationally
+assign pipex_dp_vfdsu_freg_data[63:0]  = ex4_out_result[63:0];
+assign pipex_dp_vfdsu_ereg[4:0]        = vfdsu_ex4_dst_ereg[4:0]; // destination IDs come from the EX4 registers
+assign pipex_dp_vfdsu_vreg[6:0]        = vfdsu_ex4_dst_vreg[6:0];
+
+
+
+
+
+
+// &ModuleEnd; @192
+endmodule
+
+
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v
new file mode 100644
index 00000000..cdeb3a30
--- /dev/null
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v
@@ -0,0 +1,691 @@
+/*Copyright 2019-2021 T-Head Semiconductor Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// &ModuleBeg; @22
+module ct_vfdsu_srt(
+  cp0_vfpu_icg_en,
+  cp0_yy_clk_en,
+  cpurst_b,
+  ex1_div,
+  ex1_divisor,
+  ex1_pipedown,
+  ex1_remainder,
+  ex1_sqrt,
+  ex2_pipedown,
+  ex2_srt_first_round,
+  forever_cpuclk,
+  pad_yy_icg_scan_en,
+  srt_ctrl_rem_zero,
+  srt_ctrl_skip_srt,
+  srt_secd_round,
+  srt_sm_on,
+  total_qt_rt_58,
+  vfdsu_ex2_div,
+  vfdsu_ex2_double,
+  vfdsu_ex2_dz,
+  vfdsu_ex2_expnt_add0,
+  vfdsu_ex2_expnt_add1,
+  vfdsu_ex2_nv,
+  vfdsu_ex2_of_rm_lfn,
+  vfdsu_ex2_op0_norm,
+  vfdsu_ex2_op1_norm,
+  vfdsu_ex2_qnan_f,
+  vfdsu_ex2_qnan_sign,
+  vfdsu_ex2_result_inf,
+  vfdsu_ex2_result_qnan,
+  vfdsu_ex2_result_sign,
+  vfdsu_ex2_result_zero,
+  vfdsu_ex2_rm,
+  vfdsu_ex2_single,
+  vfdsu_ex2_sqrt,
+  vfdsu_ex2_srt_skip,
+  vfdsu_ex3_doub_expnt_rst,
+  vfdsu_ex3_double,
+  vfdsu_ex3_dz,
+  vfdsu_ex3_half_expnt_rst,
+  vfdsu_ex3_id_srt_skip,
+  vfdsu_ex3_nv,
+  vfdsu_ex3_of,
+  vfdsu_ex3_potnt_of,
+  vfdsu_ex3_potnt_uf,
+  vfdsu_ex3_qnan_f,
+  vfdsu_ex3_qnan_sign,
+  vfdsu_ex3_rem_sign,
+  vfdsu_ex3_rem_zero,
+  vfdsu_ex3_result_denorm_round_add_num,
+  vfdsu_ex3_result_inf,
+  vfdsu_ex3_result_lfn,
+  vfdsu_ex3_result_qnan,
+  vfdsu_ex3_result_sign,
+  vfdsu_ex3_result_zero,
+  vfdsu_ex3_rm,
+  vfdsu_ex3_rslt_denorm,
+  vfdsu_ex3_sing_expnt_rst,
+  vfdsu_ex3_single,
+  vfdsu_ex3_uf
+);
+
+// &Ports; @23
+input           cp0_vfpu_icg_en;                       
+input           cp0_yy_clk_en;                         
+input           cpurst_b;                              
+input           ex1_div;                               
+input   [52:0]  ex1_divisor;                           
+input           ex1_pipedown;                          
+input   [59:0]  ex1_remainder;                         
+input           ex1_sqrt;                              
+input           ex2_pipedown;                          
+input           ex2_srt_first_round;                   
+input           forever_cpuclk;                        
+input           pad_yy_icg_scan_en;                    
+input           srt_secd_round;                        
+input           srt_sm_on;                             
+input           vfdsu_ex2_div;                         
+input           vfdsu_ex2_double;                      
+input           vfdsu_ex2_dz;                          
+input   [12:0]  vfdsu_ex2_expnt_add0;                  
+input   [12:0]  vfdsu_ex2_expnt_add1;                  
+input           vfdsu_ex2_nv;                          
+input           vfdsu_ex2_of_rm_lfn;                   
+input           vfdsu_ex2_op0_norm;                    
+input           vfdsu_ex2_op1_norm;                    
+input   [51:0]  vfdsu_ex2_qnan_f;                      
+input           vfdsu_ex2_qnan_sign;                   
+input           vfdsu_ex2_result_inf;                  
+input           vfdsu_ex2_result_qnan;                 
+input           vfdsu_ex2_result_sign;                 
+input           vfdsu_ex2_result_zero;                 
+input   [2 :0]  vfdsu_ex2_rm;                          
+input           vfdsu_ex2_single;                      
+input           vfdsu_ex2_sqrt;                        
+input           vfdsu_ex2_srt_skip;                    
+output          srt_ctrl_rem_zero;                     
+output          srt_ctrl_skip_srt;                     
+output  [57:0]  total_qt_rt_58;                        
+output  [12:0]  vfdsu_ex3_doub_expnt_rst;              
+output          vfdsu_ex3_double;                      
+output          vfdsu_ex3_dz;                          
+output  [12:0]  vfdsu_ex3_half_expnt_rst;              
+output          vfdsu_ex3_id_srt_skip;                 
+output          vfdsu_ex3_nv;                          
+output          vfdsu_ex3_of;                          
+output          vfdsu_ex3_potnt_of;                    
+output          vfdsu_ex3_potnt_uf;                    
+output  [51:0]  vfdsu_ex3_qnan_f;                      
+output          vfdsu_ex3_qnan_sign;                   
+output          vfdsu_ex3_rem_sign;                    
+output          vfdsu_ex3_rem_zero;                    
+output  [52:0]  vfdsu_ex3_result_denorm_round_add_num; 
+output          vfdsu_ex3_result_inf;                  
+output          vfdsu_ex3_result_lfn;                  
+output          vfdsu_ex3_result_qnan;                 
+output          vfdsu_ex3_result_sign;                 
+output          vfdsu_ex3_result_zero;                 
+output  [2 :0]  vfdsu_ex3_rm;                          
+output          vfdsu_ex3_rslt_denorm;                 
+output  [8 :0]  vfdsu_ex3_sing_expnt_rst;              
+output          vfdsu_ex3_single;                      
+output          vfdsu_ex3_uf;                          
+
+// &Regs; @24
+reg     [52:0]  ex2_result_double_denorm_round_add_num; 
+reg     [52:0]  ex2_result_half_denorm_round_add_num;  
+reg     [52:0]  ex2_result_single_denorm_round_add_num; 
+reg     [12:0]  vfdsu_ex3_doub_expnt_rst;              
+reg             vfdsu_ex3_double;                      
+reg             vfdsu_ex3_dz;                          
+reg     [12:0]  vfdsu_ex3_half_expnt_rst;              
+reg             vfdsu_ex3_id_srt_skip;                 
+reg             vfdsu_ex3_nv;                          
+reg             vfdsu_ex3_of;                          
+reg             vfdsu_ex3_potnt_of;                    
+reg             vfdsu_ex3_potnt_uf;                    
+reg     [51:0]  vfdsu_ex3_qnan_f;                      
+reg             vfdsu_ex3_qnan_sign;                   
+reg             vfdsu_ex3_rem_sign;                    
+reg     [52:0]  vfdsu_ex3_result_denorm_round_add_num; 
+reg             vfdsu_ex3_result_inf;                  
+reg             vfdsu_ex3_result_lfn;                  
+reg             vfdsu_ex3_result_qnan;                 
+reg             vfdsu_ex3_result_sign;                 
+reg             vfdsu_ex3_result_zero;                 
+reg     [2 :0]  vfdsu_ex3_rm;                          
+reg             vfdsu_ex3_rslt_denorm;                 
+reg     [8 :0]  vfdsu_ex3_sing_expnt_rst;              
+reg             vfdsu_ex3_single;                      
+reg             vfdsu_ex3_uf;                          
+
+// &Wires; @25
+wire            cp0_vfpu_icg_en;                       
+wire            cp0_yy_clk_en;                         
+wire            cpurst_b;                              
+wire            ex1_div;                               
+wire    [52:0]  ex1_divisor;                           
+wire            ex1_pipedown;                          
+wire    [59:0]  ex1_remainder;                         
+wire            ex1_sqrt;                              
+wire            ex2_div_of;                            
+wire            ex2_div_uf;                            
+wire            ex2_doub_expnt_of;                     
+wire            ex2_doub_expnt_uf;                     
+wire            ex2_doub_potnt_of;                     
+wire            ex2_doub_potnt_uf;                     
+wire            ex2_double_id_nor_srt_skip;            
+wire            ex2_expnt_of;                          
+wire    [12:0]  ex2_expnt_result;                      
+wire            ex2_expnt_uf;                          
+wire            ex2_half_expnt_of;                     
+wire            ex2_half_expnt_uf;                     
+wire            ex2_half_id_nor_srt_skip;              
+wire            ex2_half_potnt_of;                     
+wire            ex2_half_potnt_uf;                     
+wire            ex2_id_nor_srt_skip;                   
+wire            ex2_of;                                
+wire            ex2_of_plus;                           
+wire            ex2_pipe_clk;                          
+wire            ex2_pipe_clk_en;                       
+wire            ex2_pipedown;                          
+wire            ex2_potnt_of;                          
+wire            ex2_potnt_of_pre;                      
+wire            ex2_potnt_uf;                          
+wire            ex2_potnt_uf_pre;                      
+wire    [52:0]  ex2_result_denorm_round_add_num;       
+wire            ex2_result_inf;                        
+wire            ex2_result_lfn;                        
+wire            ex2_result_qnan;                       
+wire            ex2_result_zero;                       
+wire            ex2_rslt_denorm;                       
+wire            ex2_sing_expnt_of;                     
+wire            ex2_sing_expnt_uf;                     
+wire            ex2_sing_potnt_of;                     
+wire            ex2_sing_potnt_uf;                     
+wire            ex2_single_id_nor_srt_skip;            
+wire    [12:0]  ex2_sqrt_expnt_result;                 
+wire            ex2_srt_first_round;                   
+wire            ex2_uf;                                
+wire            ex2_uf_plus;                           
+wire            forever_cpuclk;                        
+wire    [6 :0]  initial_bound_sel_in;                  
+wire    [55:0]  initial_divisor_in;                    
+wire    [60:0]  initial_remainder_in;                  
+wire            initial_srt_en;                        
+wire            initial_srt_sel_div_in;                
+wire            initial_srt_sel_sqrt_in;               
+wire            pad_yy_icg_scan_en;                    
+wire            srt_ctrl_rem_zero;                     
+wire            srt_ctrl_skip_srt;                     
+wire            srt_first_round;                       
+wire    [60:0]  srt_remainder;                         
+wire    [59:0]  srt_remainder_out;                     
+wire            srt_remainder_sign;                    
+wire            srt_secd_round;                        
+wire            srt_sm_on;                             
+wire    [57:0]  total_qt_rt;                           
+wire    [57:0]  total_qt_rt_58;                        
+wire    [57:0]  vdiv_qt_rt;                            
+wire            vfdsu_ex2_div;                         
+wire            vfdsu_ex2_double;                      
+wire            vfdsu_ex2_dz;                          
+wire    [12:0]  vfdsu_ex2_expnt_add0;                  
+wire    [12:0]  vfdsu_ex2_expnt_add1;                  
+wire    [12:0]  vfdsu_ex2_expnt_rst;                   
+wire            vfdsu_ex2_nv;                          
+wire            vfdsu_ex2_of_rm_lfn;                   
+wire            vfdsu_ex2_op0_norm;                    
+wire            vfdsu_ex2_op1_norm;                    
+wire    [51:0]  vfdsu_ex2_qnan_f;                      
+wire            vfdsu_ex2_qnan_sign;                   
+wire            vfdsu_ex2_result_inf;                  
+wire            vfdsu_ex2_result_qnan;                 
+wire            vfdsu_ex2_result_sign;                 
+wire            vfdsu_ex2_result_zero;                 
+wire    [2 :0]  vfdsu_ex2_rm;                          
+wire            vfdsu_ex2_single;                      
+wire            vfdsu_ex2_sqrt;                        
+wire            vfdsu_ex2_srt_skip;                    
+wire            vfdsu_ex3_rem_zero;                    
+
+
+//====================EX2 Expt info=========================
+//EX1 only detect of/uf under id condition
+//EX2 will deal with other condition
+
+//When input is normal, overflow when E1-E2 > 128/1024/16 (single/double/half)
+//here we move the expnt result calculation into the second stage
+
+assign vfdsu_ex2_expnt_rst[12:0] =  (vfdsu_ex2_sqrt) // sqrt takes the halved difference, otherwise the raw difference
+                                    ? ex2_sqrt_expnt_result[12:0]
+                                    : ex2_expnt_result[12:0];
+assign ex2_sqrt_expnt_result[12:0] = {ex2_expnt_result[12],
+                                      ex2_expnt_result[12:1]}; // shift right by one, replicating the sign bit
+assign ex2_expnt_result[12:0]  = vfdsu_ex2_expnt_add0[12:0] - vfdsu_ex2_expnt_add1[12:0]; // 13-bit two's complement difference
+assign ex2_doub_expnt_of = ~vfdsu_ex2_expnt_rst[12] && (vfdsu_ex2_expnt_rst[11] // double: non-negative and > 1024
+                                                        || (vfdsu_ex2_expnt_rst[10] &&
+                                                            |vfdsu_ex2_expnt_rst[9:0]));
+assign ex2_sing_expnt_of = ~vfdsu_ex2_expnt_rst[9] && (vfdsu_ex2_expnt_rst[8] // single: looks at bits [9:0] only, > 128
+                                                      || (vfdsu_ex2_expnt_rst[7]  &&
+                                                          |vfdsu_ex2_expnt_rst[6:0]));
+
+assign ex2_half_expnt_of = ~vfdsu_ex2_expnt_rst[6] && (vfdsu_ex2_expnt_rst[5] // half: looks at bits [6:0] only, > 16
+                                                      || (vfdsu_ex2_expnt_rst[4]  &&
+                                                          |vfdsu_ex2_expnt_rst[3:0]));
+assign ex2_expnt_of      = vfdsu_ex2_double ? ex2_doub_expnt_of : // format muxes below: double first, then single, half as fallback
+                                              vfdsu_ex2_single  ? ex2_sing_expnt_of
+                                                                : ex2_half_expnt_of;
+assign ex2_potnt_of_pre  = vfdsu_ex2_double ? ex2_doub_potnt_of :
+                           vfdsu_ex2_single ? ex2_sing_potnt_of : ex2_half_potnt_of;   
+assign ex2_potnt_uf_pre  = vfdsu_ex2_double ? ex2_doub_potnt_uf : 
+                           vfdsu_ex2_single ? ex2_sing_potnt_uf : ex2_half_potnt_uf;
+assign ex2_expnt_uf      = vfdsu_ex2_double ? ex2_doub_expnt_uf :
+                           vfdsu_ex2_single ? ex2_sing_expnt_uf : ex2_half_expnt_uf;
+assign ex2_id_nor_srt_skip   = vfdsu_ex2_double ? ex2_double_id_nor_srt_skip :
+                               vfdsu_ex2_single ? ex2_single_id_nor_srt_skip
+                                                : ex2_half_id_nor_srt_skip; 
+assign ex2_result_denorm_round_add_num[52:0] = vfdsu_ex2_double ? // picks one of the three per-format lookup tables
+                                               ex2_result_double_denorm_round_add_num[52:0] :
+                                               vfdsu_ex2_single ? 
+                                               ex2_result_single_denorm_round_add_num[52:0] :
+                                               ex2_result_half_denorm_round_add_num[52:0];
+                                             
+                                                      
+//potential overflow when E1-E2 = 128/1024/16 (single/double/half)
+assign ex2_doub_potnt_of = ~vfdsu_ex2_expnt_rst[12] && // double: exponent result == 1024 exactly
+                           ~vfdsu_ex2_expnt_rst[11] &&
+                            vfdsu_ex2_expnt_rst[10] &&
+                          ~|vfdsu_ex2_expnt_rst[9:0];
+assign ex2_sing_potnt_of = ~vfdsu_ex2_expnt_rst[9]  && // single: bits [9:0] == 128
+                           ~vfdsu_ex2_expnt_rst[8]  &&
+                            vfdsu_ex2_expnt_rst[7]  &&
+                          ~|vfdsu_ex2_expnt_rst[6:0];
+assign ex2_half_potnt_of = ~vfdsu_ex2_expnt_rst[6]  && // half: bits [6:0] == 16
+                           ~vfdsu_ex2_expnt_rst[5]  &&
+                            vfdsu_ex2_expnt_rst[4]  &&
+                          ~|vfdsu_ex2_expnt_rst[3:0];  
+assign ex2_potnt_of      = ex2_potnt_of_pre && // flagged only for a divide with both operands marked norm
+                           vfdsu_ex2_op0_norm && 
+                           vfdsu_ex2_op1_norm && 
+                           vfdsu_ex2_div;
+
+//When input is normal, underflow when E1-E2 <= -127/-1023/-15
+assign ex2_doub_expnt_uf = vfdsu_ex2_expnt_rst[12] && (vfdsu_ex2_expnt_rst[11:0] <= 12'hc01); // negative and <= -1023
+assign ex2_sing_expnt_uf = vfdsu_ex2_expnt_rst[12] && (vfdsu_ex2_expnt_rst[11:0] <= 12'hf81); // negative and <= -127
+assign ex2_half_expnt_uf = vfdsu_ex2_expnt_rst[12] && (vfdsu_ex2_expnt_rst[11:0] <= 12'hff1); // negative and <= -15
+assign ex2_half_potnt_uf = &vfdsu_ex2_expnt_rst[6:4]   && // half potential underflow: bits [6:0] == 7'b1110010 (-14)
+                          ~|vfdsu_ex2_expnt_rst[3:2]   &&
+                            vfdsu_ex2_expnt_rst[1]     &&
+                           !vfdsu_ex2_expnt_rst[0];
+
+
+//potential underflow when E1-E2 = -126/-1022 (single/double); the half case (-14) is ex2_half_potnt_uf above
+assign ex2_doub_potnt_uf = &vfdsu_ex2_expnt_rst[12:10] && // double: all 13 bits == 13'h1c02 (-1022)
+                          ~|vfdsu_ex2_expnt_rst[9:2]   &&
+                            vfdsu_ex2_expnt_rst[1]     && 
+                           !vfdsu_ex2_expnt_rst[0];
+assign ex2_sing_potnt_uf = &vfdsu_ex2_expnt_rst[9:7]   && // single: bits [9:0] == 10'b1110000010 (-126)
+                          ~|vfdsu_ex2_expnt_rst[6:2]   &&
+                            vfdsu_ex2_expnt_rst[1]     &&
+                           !vfdsu_ex2_expnt_rst[0];
+
+assign ex2_potnt_uf      = (ex2_potnt_uf_pre && // first term: divide with both operands marked norm
+                            vfdsu_ex2_op0_norm && 
+                            vfdsu_ex2_op1_norm &&
+                            vfdsu_ex2_div)     ||
+                           (ex2_potnt_uf_pre   && // second term: any operation with op0 marked norm
+                            vfdsu_ex2_op0_norm); // NOTE(review): the first term is absorbed by this one, so the OR equals ex2_potnt_uf_pre && vfdsu_ex2_op0_norm - confirm this is intended
+
+//========================EX2 Overflow======================
+//ex2 overflow when 
+//  1.op0 & op1 both norm && expnt overflow
+//  2.ex1_id_of
+assign ex2_of      = ex2_of_plus; // plain alias
+assign ex2_of_plus = ex2_div_of  && vfdsu_ex2_div; // overflow is only raised for divide
+assign ex2_div_of  = vfdsu_ex2_op0_norm && // both operands marked norm and the format-selected exponent overflow set
+                     vfdsu_ex2_op1_norm && 
+                     ex2_expnt_of;
+
+//=======================EX2 Underflow======================
+//ex2 underflow when 
+//  1.op0 & op1 both norm && expnt underflow
+//  2.ex1_id_uf
+//  and detect when to skip the srt; here, we have a further optimization
+assign ex2_uf      = ex2_uf_plus; // plain alias
+assign ex2_uf_plus = ex2_div_uf  && vfdsu_ex2_div; // underflow is only raised for divide
+assign ex2_div_uf  = vfdsu_ex2_op0_norm && // both operands marked norm and the format-selected exponent underflow set
+                     vfdsu_ex2_op1_norm && 
+                     ex2_expnt_uf;
+assign ex2_double_id_nor_srt_skip =  vfdsu_ex2_expnt_rst[12] // double: exponent result < -1075 (13'h1bcd)
+                                     && (vfdsu_ex2_expnt_rst[11:0]<12'hbcd);
+assign ex2_single_id_nor_srt_skip =  vfdsu_ex2_expnt_rst[12] // single: exponent result < -150 (13'h1f6a)
+                                     && (vfdsu_ex2_expnt_rst[11:0]<12'hf6a);
+assign ex2_half_id_nor_srt_skip   =  vfdsu_ex2_expnt_rst[12] // half: exponent result < -25 (13'h1fe7)
+                                     && (vfdsu_ex2_expnt_rst[11:0]<12'hfe7);
+assign ex2_rslt_denorm            = ex2_uf; // the denormal-result flag is the underflow flag itself
+
+//=======================EX2 skip srt iteration======================
+assign srt_ctrl_skip_srt   =  ex2_of || ex2_id_nor_srt_skip // skip on overflow, on the exponent-threshold skip,
+                                     || vfdsu_ex2_srt_skip; // or when the incoming EX2 skip flag is set
+//===============ex2 round prepare for denormal round======
+// &CombBeg; @146
+always @( vfdsu_ex2_expnt_rst[12:0]) // double table: one-hot value with bit (-1022 - exponent) set for exponents -1022..-1074
+begin // entry comments give the signed exponent, then (first rows only) exponent + 1023
+case(vfdsu_ex2_expnt_rst[12:0])
+  13'h1c02:ex2_result_double_denorm_round_add_num[52:0] = 53'h1; //-1022 1
+  13'h1c01:ex2_result_double_denorm_round_add_num[52:0] = 53'h2; //-1023 0
+  13'h1c00:ex2_result_double_denorm_round_add_num[52:0] = 53'h4; //-1024 -1
+  13'h1bff:ex2_result_double_denorm_round_add_num[52:0] = 53'h8; //-1025 -2
+  13'h1bfe:ex2_result_double_denorm_round_add_num[52:0] = 53'h10; //-1026 -3
+  13'h1bfd:ex2_result_double_denorm_round_add_num[52:0] = 53'h20; //-1027 -4
+  13'h1bfc:ex2_result_double_denorm_round_add_num[52:0] = 53'h40; //-1028 -5
+  13'h1bfb:ex2_result_double_denorm_round_add_num[52:0] = 53'h80; //-1029 -6
+  13'h1bfa:ex2_result_double_denorm_round_add_num[52:0] = 53'h100; //-1030 -7
+  13'h1bf9:ex2_result_double_denorm_round_add_num[52:0] = 53'h200; //-1031 -8
+  13'h1bf8:ex2_result_double_denorm_round_add_num[52:0] = 53'h400; //-1032 -9
+  13'h1bf7:ex2_result_double_denorm_round_add_num[52:0] = 53'h800; //-1033 -10
+  13'h1bf6:ex2_result_double_denorm_round_add_num[52:0] = 53'h1000; //-1034 -11
+  13'h1bf5:ex2_result_double_denorm_round_add_num[52:0] = 53'h2000; //-1035 -12
+  13'h1bf4:ex2_result_double_denorm_round_add_num[52:0] = 53'h4000; //-1036 -13   
+  13'h1bf3:ex2_result_double_denorm_round_add_num[52:0] = 53'h8000; // -1037
+  13'h1bf2:ex2_result_double_denorm_round_add_num[52:0] = 53'h10000;//-1038
+  13'h1bf1:ex2_result_double_denorm_round_add_num[52:0] = 53'h20000;//-1039
+  13'h1bf0:ex2_result_double_denorm_round_add_num[52:0] = 53'h40000; //-1040
+  13'h1bef:ex2_result_double_denorm_round_add_num[52:0] = 53'h80000; //-1041
+  13'h1bee:ex2_result_double_denorm_round_add_num[52:0] = 53'h100000; //-1042
+  13'h1bed:ex2_result_double_denorm_round_add_num[52:0] = 53'h200000; //-1043
+  13'h1bec:ex2_result_double_denorm_round_add_num[52:0] = 53'h400000; //-1044
+  13'h1beb:ex2_result_double_denorm_round_add_num[52:0] = 53'h800000; //-1045
+  13'h1bea:ex2_result_double_denorm_round_add_num[52:0] = 53'h1000000;//-1046
+  13'h1be9:ex2_result_double_denorm_round_add_num[52:0] = 53'h2000000;//-1047
+  13'h1be8:ex2_result_double_denorm_round_add_num[52:0] = 53'h4000000; //-1048
+  13'h1be7:ex2_result_double_denorm_round_add_num[52:0] = 53'h8000000; //-1049
+  13'h1be6:ex2_result_double_denorm_round_add_num[52:0] = 53'h10000000;//-1050
+  13'h1be5:ex2_result_double_denorm_round_add_num[52:0] = 53'h20000000; //-1051
+  13'h1be4:ex2_result_double_denorm_round_add_num[52:0] = 53'h40000000; //-1052
+  13'h1be3:ex2_result_double_denorm_round_add_num[52:0] = 53'h80000000; //-1053
+  13'h1be2:ex2_result_double_denorm_round_add_num[52:0] = 53'h100000000; //-1054
+  13'h1be1:ex2_result_double_denorm_round_add_num[52:0] = 53'h200000000; //-1055
+  13'h1be0:ex2_result_double_denorm_round_add_num[52:0] = 53'h400000000; //-1056
+  13'h1bdf:ex2_result_double_denorm_round_add_num[52:0] = 53'h800000000; //-1057
+  13'h1bde:ex2_result_double_denorm_round_add_num[52:0] = 53'h1000000000; //-1058
+  13'h1bdd:ex2_result_double_denorm_round_add_num[52:0] = 53'h2000000000; //-1059
+  13'h1bdc:ex2_result_double_denorm_round_add_num[52:0] = 53'h4000000000; //-1060
+  13'h1bdb:ex2_result_double_denorm_round_add_num[52:0] = 53'h8000000000; //-1061
+  13'h1bda:ex2_result_double_denorm_round_add_num[52:0] = 53'h10000000000; //-1062
+  13'h1bd9:ex2_result_double_denorm_round_add_num[52:0] = 53'h20000000000; //-1063
+  13'h1bd8:ex2_result_double_denorm_round_add_num[52:0] = 53'h40000000000; //-1064
+  13'h1bd7:ex2_result_double_denorm_round_add_num[52:0] = 53'h80000000000; //-1065
+  13'h1bd6:ex2_result_double_denorm_round_add_num[52:0] = 53'h100000000000; //-1066
+  13'h1bd5:ex2_result_double_denorm_round_add_num[52:0] = 53'h200000000000; //-1067
+  13'h1bd4:ex2_result_double_denorm_round_add_num[52:0] = 53'h400000000000; //-1068
+  13'h1bd3:ex2_result_double_denorm_round_add_num[52:0] = 53'h800000000000; //-1069
+  13'h1bd2:ex2_result_double_denorm_round_add_num[52:0] = 53'h1000000000000;//-1070
+  13'h1bd1:ex2_result_double_denorm_round_add_num[52:0] = 53'h2000000000000; //-1071
+  13'h1bd0:ex2_result_double_denorm_round_add_num[52:0] = 53'h4000000000000; //-1072
+  13'h1bcf:ex2_result_double_denorm_round_add_num[52:0] = 53'h8000000000000; //-1073
+  13'h1bce:ex2_result_double_denorm_round_add_num[52:0] = 53'h10000000000000; //-1074
+  default: ex2_result_double_denorm_round_add_num[52:0] = 53'h0; // every other exponent value: zero
+endcase
+// &CombEnd; @203
+end
+// &CombBeg; @204
+always @( vfdsu_ex2_expnt_rst[12:0]) // single table: one-hot value with bit (29 + (-126 - exponent)) set for exponents -126..-149
+begin // entry comments give the signed exponent, then exponent + 127
+case(vfdsu_ex2_expnt_rst[12:0])
+  13'h1f82:ex2_result_single_denorm_round_add_num[52:0] = 53'h20000000; //-126 1
+  13'h1f81:ex2_result_single_denorm_round_add_num[52:0] = 53'h40000000; //-127 0
+  13'h1f80:ex2_result_single_denorm_round_add_num[52:0] = 53'h80000000; //-128 -1
+  13'h1f7f:ex2_result_single_denorm_round_add_num[52:0] = 53'h100000000; //-129 -2
+  13'h1f7e:ex2_result_single_denorm_round_add_num[52:0] = 53'h200000000; //-130 -3
+  13'h1f7d:ex2_result_single_denorm_round_add_num[52:0] = 53'h400000000; //-131 -4
+  13'h1f7c:ex2_result_single_denorm_round_add_num[52:0] = 53'h800000000; //-132 -5
+  13'h1f7b:ex2_result_single_denorm_round_add_num[52:0] = 53'h1000000000; //-133 -6
+  13'h1f7a:ex2_result_single_denorm_round_add_num[52:0] = 53'h2000000000; //-134 -7
+  13'h1f79:ex2_result_single_denorm_round_add_num[52:0] = 53'h4000000000; //-135 -8
+  13'h1f78:ex2_result_single_denorm_round_add_num[52:0] = 53'h8000000000; //-136 -9
+  13'h1f77:ex2_result_single_denorm_round_add_num[52:0] = 53'h10000000000; //-137 -10
+  13'h1f76:ex2_result_single_denorm_round_add_num[52:0] = 53'h20000000000; //-138 -11
+  13'h1f75:ex2_result_single_denorm_round_add_num[52:0] = 53'h40000000000; //-139 -12
+  13'h1f74:ex2_result_single_denorm_round_add_num[52:0] = 53'h80000000000; //-140 -13   
+  13'h1f73:ex2_result_single_denorm_round_add_num[52:0] = 53'h100000000000; // -141 -14
+  13'h1f72:ex2_result_single_denorm_round_add_num[52:0] = 53'h200000000000;//-142  -15
+  13'h1f71:ex2_result_single_denorm_round_add_num[52:0] = 53'h400000000000;//-143 -16
+  13'h1f70:ex2_result_single_denorm_round_add_num[52:0] = 53'h800000000000; //-144 -17
+  13'h1f6f:ex2_result_single_denorm_round_add_num[52:0] = 53'h1000000000000; //-145 -18
+  13'h1f6e:ex2_result_single_denorm_round_add_num[52:0] = 53'h2000000000000; //-146 -19
+  13'h1f6d:ex2_result_single_denorm_round_add_num[52:0] = 53'h4000000000000; //-147 -20
+  13'h1f6c:ex2_result_single_denorm_round_add_num[52:0] = 53'h8000000000000; //-148 -21
+  13'h1f6b:ex2_result_single_denorm_round_add_num[52:0] = 53'h10000000000000; //-149 -22
+  default: ex2_result_single_denorm_round_add_num[52:0] = 53'h0;  // every other exponent value (next row would be -150 -23): zero
+endcase
+// &CombEnd; @232
+end
+// &CombBeg; @233
+always @( vfdsu_ex2_expnt_rst[12:0]) // half table: one-hot value with bit (42 + (-14 - exponent)) set for exponents -14..-24
+begin // entry comments give the signed exponent, then exponent + 15
+case(vfdsu_ex2_expnt_rst[12:0])
+  13'h1ff2:ex2_result_half_denorm_round_add_num[52:0] = 53'h40000000000; //-14 1
+  13'h1ff1:ex2_result_half_denorm_round_add_num[52:0] = 53'h80000000000; //-15 0
+  13'h1ff0:ex2_result_half_denorm_round_add_num[52:0] = 53'h100000000000; //-16 -1
+  13'h1fef:ex2_result_half_denorm_round_add_num[52:0] = 53'h200000000000; //-17 -2
+  13'h1fee:ex2_result_half_denorm_round_add_num[52:0] = 53'h400000000000; //-18 -3
+  13'h1fed:ex2_result_half_denorm_round_add_num[52:0] = 53'h800000000000; //-19 -4
+  13'h1fec:ex2_result_half_denorm_round_add_num[52:0] = 53'h1000000000000; //-20 -5
+  13'h1feb:ex2_result_half_denorm_round_add_num[52:0] = 53'h2000000000000; //-21 -6
+  13'h1fea:ex2_result_half_denorm_round_add_num[52:0] = 53'h4000000000000; //-22 -7
+  13'h1fe9:ex2_result_half_denorm_round_add_num[52:0] = 53'h8000000000000; //-23 -8
+  13'h1fe8:ex2_result_half_denorm_round_add_num[52:0] = 53'h10000000000000; //-24 -9
+  default: ex2_result_half_denorm_round_add_num[52:0] = 53'h0;  // every other exponent value (next row would be -25 -10): zero
+endcase
+// &CombEnd; @248
+end
+
+//===================special result========================
+assign ex2_result_zero = vfdsu_ex2_result_zero; // passed through unchanged
+assign ex2_result_qnan = vfdsu_ex2_result_qnan; // passed through unchanged
+assign ex2_result_inf  = vfdsu_ex2_result_inf || // incoming inf flag, or an overflow that is not steered to lfn
+                         ex2_of_plus && !vfdsu_ex2_of_rm_lfn;
+assign ex2_result_lfn  =  // overflow with vfdsu_ex2_of_rm_lfn set; mutually exclusive with the overflow-to-inf term
+                         ex2_of_plus &&  vfdsu_ex2_of_rm_lfn;
+
+
+
+//====================Pipe to EX3===========================
+//gate clk
+// &Instance("gated_clk_cell","x_ex2_pipe_clk"); @262
+gated_clk_cell  x_ex2_pipe_clk ( // produces ex2_pipe_clk, the clock of the EX2->EX3 registers; the cell's logic is defined elsewhere
+  .clk_in             (forever_cpuclk    ),
+  .clk_out            (ex2_pipe_clk      ),
+  .external_en        (1'b0              ), // tied to constant zero
+  .global_en          (cp0_yy_clk_en     ),
+  .local_en           (ex2_pipe_clk_en   ), // driven from ex2_pipedown by an assign in this module
+  .module_en          (cp0_vfpu_icg_en   ),
+  .pad_yy_icg_scan_en (pad_yy_icg_scan_en)
+);
+
+// &Connect( .clk_in         (forever_cpuclk), @263
+//           .clk_out        (ex2_pipe_clk),//Out Clock @264
+//           .external_en    (1'b0), @265
+//           .global_en      (cp0_yy_clk_en), @266
+//           .local_en       (ex2_pipe_clk_en),//Local Condition @267
+//           .module_en      (cp0_vfpu_icg_en) @268
+//         ); @269
+assign ex2_pipe_clk_en = ex2_pipedown; // local clock enable is the same signal that loads the registers below
+
+always @(posedge ex2_pipe_clk or negedge cpurst_b) // EX2->EX3 pipeline registers with asynchronous active-low reset
+begin
+  if(!cpurst_b)
+  begin // reset: every EX3 register is cleared to zero
+    vfdsu_ex3_result_zero     <= 1'b0;
+    vfdsu_ex3_result_qnan     <= 1'b0;
+    vfdsu_ex3_result_inf      <= 1'b0;
+    vfdsu_ex3_result_lfn      <= 1'b0;
+    vfdsu_ex3_of              <= 1'b0;
+    vfdsu_ex3_uf              <= 1'b0;
+    vfdsu_ex3_nv              <= 1'b0;
+    vfdsu_ex3_dz              <= 1'b0;
+    vfdsu_ex3_potnt_of        <= 1'b0;
+    vfdsu_ex3_potnt_uf        <= 1'b0;
+    vfdsu_ex3_rem_sign        <= 1'b0;
+//    vfdsu_ex3_rem_zero        <= 1'b0;
+    vfdsu_ex3_doub_expnt_rst[12:0] <= 13'b0;
+    vfdsu_ex3_sing_expnt_rst[8:0] <= 9'b0;
+    vfdsu_ex3_half_expnt_rst[12:0] <= 13'b0;
+    vfdsu_ex3_result_sign     <= 1'b0;
+    vfdsu_ex3_qnan_sign       <= 1'b0;    
+    vfdsu_ex3_qnan_f[51:0]    <= 52'b0;
+    vfdsu_ex3_rm[2:0]         <= 3'b0;
+    vfdsu_ex3_result_denorm_round_add_num[52:0] 
+                              <= 53'b0;
+    vfdsu_ex3_rslt_denorm     <= 1'b0;
+    vfdsu_ex3_id_srt_skip     <= 1'b0;
+    vfdsu_ex3_double          <=  1'b0;
+    vfdsu_ex3_single          <=  1'b0;
+  end
+  else if(ex2_pipedown)
+  begin // capture: latch the EX2 results into EX3
+    vfdsu_ex3_result_zero     <= ex2_result_zero; 
+    vfdsu_ex3_result_qnan     <= ex2_result_qnan;
+    vfdsu_ex3_result_inf      <= ex2_result_inf;
+    vfdsu_ex3_result_lfn      <= ex2_result_lfn; 
+    vfdsu_ex3_of              <= ex2_of;
+    vfdsu_ex3_uf              <= ex2_uf;
+    vfdsu_ex3_nv              <= vfdsu_ex2_nv;
+    vfdsu_ex3_dz              <= vfdsu_ex2_dz;
+    vfdsu_ex3_potnt_of        <= ex2_potnt_of;
+    vfdsu_ex3_potnt_uf        <= ex2_potnt_uf;
+    vfdsu_ex3_rem_sign        <= srt_remainder_sign;
+    //vfdsu_ex3_rem_zero        <= srt_remainder_zero;
+    vfdsu_ex3_doub_expnt_rst[12:0] <= vfdsu_ex2_expnt_rst[12:0]; // full 13-bit exponent result
+    vfdsu_ex3_sing_expnt_rst[8:0] <= vfdsu_ex2_expnt_rst[8:0]; // only the low 9 bits are kept for single
+    vfdsu_ex3_half_expnt_rst[12:0] <= vfdsu_ex2_expnt_rst[12:0]; // same 13-bit value as the double copy
+    vfdsu_ex3_result_sign     <= vfdsu_ex2_result_sign;
+    vfdsu_ex3_qnan_sign       <= vfdsu_ex2_qnan_sign;    
+    vfdsu_ex3_qnan_f[51:0]    <= vfdsu_ex2_qnan_f[51:0];
+    vfdsu_ex3_rm[2:0]         <= vfdsu_ex2_rm[2:0];
+    vfdsu_ex3_result_denorm_round_add_num[52:0] 
+                              <= ex2_result_denorm_round_add_num[52:0];
+    vfdsu_ex3_rslt_denorm     <= ex2_rslt_denorm;
+    vfdsu_ex3_id_srt_skip     <= ex2_id_nor_srt_skip;
+    vfdsu_ex3_double          <= vfdsu_ex2_double;
+    vfdsu_ex3_single          <= vfdsu_ex2_single;
+  end
+  else
+  begin // hold: every register is explicitly reassigned its own value
+    vfdsu_ex3_result_zero     <= vfdsu_ex3_result_zero; 
+    vfdsu_ex3_result_qnan     <= vfdsu_ex3_result_qnan;
+    vfdsu_ex3_result_inf      <= vfdsu_ex3_result_inf;
+    vfdsu_ex3_result_lfn      <= vfdsu_ex3_result_lfn;
+    vfdsu_ex3_of              <= vfdsu_ex3_of;
+    vfdsu_ex3_uf              <= vfdsu_ex3_uf;
+    vfdsu_ex3_nv              <= vfdsu_ex3_nv;
+    vfdsu_ex3_dz              <= vfdsu_ex3_dz;
+    vfdsu_ex3_potnt_of        <= vfdsu_ex3_potnt_of;
+    vfdsu_ex3_potnt_uf        <= vfdsu_ex3_potnt_uf;
+    vfdsu_ex3_rem_sign        <= vfdsu_ex3_rem_sign;
+    //vfdsu_ex3_rem_zero        <= vfdsu_ex3_rem_zero;
+    vfdsu_ex3_doub_expnt_rst[12:0] <= vfdsu_ex3_doub_expnt_rst[12:0];
+    vfdsu_ex3_sing_expnt_rst[8:0] <= vfdsu_ex3_sing_expnt_rst[8:0];
+    vfdsu_ex3_half_expnt_rst[12:0] <= vfdsu_ex3_half_expnt_rst[12:0];
+    vfdsu_ex3_result_sign     <= vfdsu_ex3_result_sign;
+    vfdsu_ex3_qnan_sign       <= vfdsu_ex3_qnan_sign;     
+    vfdsu_ex3_qnan_f[51:0]    <= vfdsu_ex3_qnan_f[51:0];
+    vfdsu_ex3_rm[2:0]         <= vfdsu_ex3_rm[2:0];
+    vfdsu_ex3_result_denorm_round_add_num[52:0] 
+                              <= vfdsu_ex3_result_denorm_round_add_num[52:0];
+    vfdsu_ex3_rslt_denorm     <=  vfdsu_ex3_rslt_denorm;
+    vfdsu_ex3_id_srt_skip    <=  vfdsu_ex3_id_srt_skip;
+    vfdsu_ex3_double          <= vfdsu_ex3_double;
+    vfdsu_ex3_single          <= vfdsu_ex3_single;
+  end
+end
+assign vfdsu_ex3_rem_zero       =  ~|srt_remainder[60:0]; // combinational: high when all 61 remainder bits are zero (the registered version is commented out)
+assign srt_ctrl_rem_zero        =  vfdsu_ex3_rem_zero; // plain alias
+// &Force("output","vfdsu_ex3_potnt_of"); @365
+// &Force("output","vfdsu_ex3_potnt_uf"); @366
+// &Force("output","vfdsu_ex3_rem_sign"); @367
+// &Force("output","vfdsu_ex3_rem_zero"); @368
+// &Force("output","vfdsu_ex3_result_zero"); @369
+// &Force("output","vfdsu_ex3_result_qnan"); @370
+// &Force("output","vfdsu_ex3_result_inf"); @371
+// &Force("output","vfdsu_ex3_result_lfn"); @372
+// &Force("output","vfdsu_ex3_dz"); @373
+// &Force("output","vfdsu_ex3_nv"); @374
+// &Force("output","vfdsu_ex3_of"); @375
+// &Force("output","vfdsu_ex3_uf"); @376
+// &Force("output","vfdsu_ex3_result_sign"); @377
+// &Force("output","vfdsu_ex3_doub_expnt_rst"); @378
+// &Force("output","vfdsu_ex3_sing_expnt_rst"); @379
+// &Force("output","vfdsu_ex3_half_expnt_rst"); @380
+// &Force("output","vfdsu_ex3_qnan_sign"); @381
+// &Force("output","vfdsu_ex3_qnan_f"); @382
+// &Force("output","vfdsu_ex3_rm"); @383
+// &Force("output","vfdsu_ex3_result_denorm_round_add_num"); @384
+// &Force("output","vfdsu_ex3_rslt_denorm"); @385
+// &Force("output","vfdsu_ex3_id_srt_skip"); @386
+// &Force("output","vfdsu_ex3_single"); @387
+// &Force("output","vfdsu_ex3_double"); @388
+
+//==========================================================
+//    SRT Remainder & Divisor for Quotient/Root Generate
+//==========================================================
+// &Instance("ct_vfdsu_srt_radix16_with_sqrt_for_vdsp"); @411
+// &Connect(.srt_sm_on    (srt_sm_on_all)); @412
+// &Force("bus","ex1_remainder",59,0); @414
+// &Force("bus","srt_remainder_out",69,0); @415
+// &Force("nonport","srt_remainder_out"); @422
+// &Force("nonport","vdiv_qt_rt"); @423
+assign initial_divisor_in[55:0]   = {ex1_divisor[52:0],3'b000}; // 53-bit divisor with three zero bits appended below
+
+assign initial_remainder_in[60:0] = {2'b00,ex1_remainder[59:1]}; // bit 0 of ex1_remainder is dropped, two zero bits prepended
+
+assign initial_bound_sel_in[6:0]  = ex1_div ? initial_divisor_in[55:49]:{7{1'b0}}; // top 7 divisor bits for divide, zero otherwise
+
+assign initial_srt_en             = ex1_pipedown;
+assign initial_srt_sel_div_in     = ex1_div;
+assign initial_srt_sel_sqrt_in    = ex1_sqrt;
+
+assign srt_first_round            = ex2_srt_first_round; // plain alias
+
+// &Instance("ct_vfdsu_srt_radix16_with_sqrt"); @436
+ct_vfdsu_srt_radix16_with_sqrt  x_ct_vfdsu_srt_radix16_with_sqrt ( // port directions and behaviour are defined in that module, not visible here
+  .cp0_vfpu_icg_en         (cp0_vfpu_icg_en        ),
+  .cp0_yy_clk_en           (cp0_yy_clk_en          ),
+  .cpurst_b                (cpurst_b               ),
+  .forever_cpuclk          (forever_cpuclk         ),
+  .initial_bound_sel_in    (initial_bound_sel_in   ), // 7 bits, driven by the assigns in this module
+  .initial_divisor_in      (initial_divisor_in     ), // 56 bits
+  .initial_remainder_in    (initial_remainder_in   ), // 61 bits
+  .initial_srt_en          (initial_srt_en         ), // ex1_pipedown
+  .initial_srt_sel_div_in  (initial_srt_sel_div_in ), // ex1_div
+  .initial_srt_sel_sqrt_in (initial_srt_sel_sqrt_in), // ex1_sqrt
+  .pad_yy_icg_scan_en      (pad_yy_icg_scan_en     ),
+  .srt_first_round         (srt_first_round        ), // ex2_srt_first_round
+  .srt_remainder           (srt_remainder          ), // 61 bits; reduced to vfdsu_ex3_rem_zero in this module
+  .srt_remainder_out       (srt_remainder_out      ), // 60-bit wire here; the Force directives list it as nonport
+  .srt_remainder_sign      (srt_remainder_sign     ), // registered into vfdsu_ex3_rem_sign in this module
+  .srt_secd_round          (srt_secd_round         ),
+  .srt_sm_on               (srt_sm_on              ),
+  .total_qt_rt             (total_qt_rt            ), // 58 bits; feeds total_qt_rt_58
+  .vdiv_qt_rt              (vdiv_qt_rt             )  // 58-bit wire here; the Force directives list it as nonport
+);
+
+
+// &Force("bus","ex1_remainder",59,0); @438
+
+assign total_qt_rt_58[57:0] = {total_qt_rt[57:2],2'b00}; // copy of total_qt_rt with the two lowest bits forced to zero
+
+// &ModuleEnd; @443
+endmodule
+
+
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt_radix16_bound_table.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt_radix16_bound_table.v
new file mode 100644
index 00000000..097562e3
--- /dev/null
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt_radix16_bound_table.v
@@ -0,0 +1,1168 @@
+/*Copyright 2019-2021 T-Head Semiconductor Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// &ModuleBeg; @22
+module ct_vfdsu_srt_radix16_bound_table(
+  bound_sel,
+  digit_bound_1,
+  digit_bound_2,
+  digit_bound_3,
+  digit_bound_4,
+  digit_bound_5,
+  digit_bound_6,
+  digit_bound_7,
+  digit_bound_8,
+  digit_bound_9,
+  sqrt_first_round,
+  sqrt_secd_round,
+  sqrt_secd_round_sign
+);
+
+// &Ports; @23
+input   [6 :0]  bound_sel;           
+input           sqrt_first_round;    
+input           sqrt_secd_round;     
+input           sqrt_secd_round_sign; 
+output  [11:0]  digit_bound_1;       
+output  [11:0]  digit_bound_2;       
+output  [11:0]  digit_bound_3;       
+output  [11:0]  digit_bound_4;       
+output  [11:0]  digit_bound_5;       
+output  [11:0]  digit_bound_6;       
+output  [11:0]  digit_bound_7;       
+output  [11:0]  digit_bound_8;       
+output  [11:0]  digit_bound_9;       
+
+// &Regs; @24
+reg     [11:0]  ori_digit_bound_1;   
+reg     [11:0]  ori_digit_bound_2;   
+reg     [11:0]  ori_digit_bound_3;   
+reg     [11:0]  ori_digit_bound_4;   
+reg     [11:0]  ori_digit_bound_5;   
+reg     [11:0]  ori_digit_bound_6;   
+reg     [11:0]  ori_digit_bound_7;   
+reg     [11:0]  ori_digit_bound_8;   
+reg     [11:0]  ori_digit_bound_9;   
+reg     [11:0]  sqrt_digit_boundm2_1; 
+reg     [11:0]  sqrt_digit_boundm2_2; 
+reg     [11:0]  sqrt_digit_boundm2_3; 
+reg     [11:0]  sqrt_digit_boundm2_4; 
+reg     [11:0]  sqrt_digit_boundm2_5; 
+reg     [11:0]  sqrt_digit_boundm2_6; 
+reg     [11:0]  sqrt_digit_boundm2_7; 
+reg     [11:0]  sqrt_digit_boundm2_8; 
+reg     [11:0]  sqrt_digit_boundm2_9; 
+reg     [11:0]  sqrt_digit_boundp2_1; 
+reg     [11:0]  sqrt_digit_boundp2_2; 
+reg     [11:0]  sqrt_digit_boundp2_3; 
+reg     [11:0]  sqrt_digit_boundp2_4; 
+reg     [11:0]  sqrt_digit_boundp2_5; 
+reg     [11:0]  sqrt_digit_boundp2_6; 
+reg     [11:0]  sqrt_digit_boundp2_7; 
+reg     [11:0]  sqrt_digit_boundp2_8; 
+reg     [11:0]  sqrt_digit_boundp2_9; 
+
+// &Wires; @25
+wire    [6 :0]  bound_sel;           
+wire    [11:0]  digit_bound_1;       
+wire    [11:0]  digit_bound_2;       
+wire    [11:0]  digit_bound_3;       
+wire    [11:0]  digit_bound_4;       
+wire    [11:0]  digit_bound_5;       
+wire    [11:0]  digit_bound_6;       
+wire    [11:0]  digit_bound_7;       
+wire    [11:0]  digit_bound_8;       
+wire    [11:0]  digit_bound_9;       
+wire    [11:0]  sqrt_digit_bound2_1; 
+wire    [11:0]  sqrt_digit_bound2_2; 
+wire    [11:0]  sqrt_digit_bound2_3; 
+wire    [11:0]  sqrt_digit_bound2_4; 
+wire    [11:0]  sqrt_digit_bound2_5; 
+wire    [11:0]  sqrt_digit_bound2_6; 
+wire    [11:0]  sqrt_digit_bound2_7; 
+wire    [11:0]  sqrt_digit_bound2_8; 
+wire    [11:0]  sqrt_digit_bound2_9; 
+wire            sqrt_first_round;    
+wire            sqrt_secd_round;     
+wire            sqrt_secd_round_sign; 
+
+
+// &CombBeg; @27
+always @( bound_sel[6:0])
+begin
+case(bound_sel[6:0])
+  	7'h40:
+	begin
+		ori_digit_bound_1[11:0] = 12'h1b;//27
+		ori_digit_bound_2[11:0] = 12'h5c;//92
+		ori_digit_bound_3[11:0] = 12'h9d;//157
+		ori_digit_bound_4[11:0] = 12'hde;//222
+		ori_digit_bound_5[11:0] = 12'h120;//288
+		ori_digit_bound_6[11:0] = 12'h161;//353
+		ori_digit_bound_7[11:0] = 12'h1a2;//418
+		ori_digit_bound_8[11:0] = 12'h1e3;//483
+		ori_digit_bound_9[11:0] = 12'h225;//549
+
+	end
+	7'h41:
+	begin
+		ori_digit_bound_1[11:0] = 12'h1b;//27
+		ori_digit_bound_2[11:0] = 12'h5d;//93
+		ori_digit_bound_3[11:0] = 12'ha0;//160
+		ori_digit_bound_4[11:0] = 12'he2;//226
+		ori_digit_bound_5[11:0] = 12'h124;//292
+		ori_digit_bound_6[11:0] = 12'h166;//358
+		ori_digit_bound_7[11:0] = 12'h1a9;//425
+		ori_digit_bound_8[11:0] = 12'h1eb;//491
+		ori_digit_bound_9[11:0] = 12'h22d;//557
+
+	end
+	7'h42:
+	begin
+		ori_digit_bound_1[11:0] = 12'h1b;//27
+		ori_digit_bound_2[11:0] = 12'h5f;//95
+		ori_digit_bound_3[11:0] = 12'ha2;//162
+		ori_digit_bound_4[11:0] = 12'he5;//229
+		ori_digit_bound_5[11:0] = 12'h128;//296
+		ori_digit_bound_6[11:0] = 12'h16c;//364
+		ori_digit_bound_7[11:0] = 12'h1af;//431
+		ori_digit_bound_8[11:0] = 12'h1f2;//498
+		ori_digit_bound_9[11:0] = 12'h236;//566
+
+	end
+	7'h43:
+	begin
+		ori_digit_bound_1[11:0] = 12'h1c;//28
+		ori_digit_bound_2[11:0] = 12'h60;//96
+		ori_digit_bound_3[11:0] = 12'ha4;//164
+		ori_digit_bound_4[11:0] = 12'he9;//233
+		ori_digit_bound_5[11:0] = 12'h12d;//301
+		ori_digit_bound_6[11:0] = 12'h171;//369
+		ori_digit_bound_7[11:0] = 12'h1b5;//437
+		ori_digit_bound_8[11:0] = 12'h1fa;//506
+		ori_digit_bound_9[11:0] = 12'h23e;//574
+
+	end
+	7'h44:
+	begin
+		ori_digit_bound_1[11:0] = 12'h1c;//28
+		ori_digit_bound_2[11:0] = 12'h61;//97
+		ori_digit_bound_3[11:0] = 12'ha7;//167
+		ori_digit_bound_4[11:0] = 12'hec;//236
+		ori_digit_bound_5[11:0] = 12'h131;//305
+		ori_digit_bound_6[11:0] = 12'h177;//375
+		ori_digit_bound_7[11:0] = 12'h1bc;//444
+		ori_digit_bound_8[11:0] = 12'h201;//513
+		ori_digit_bound_9[11:0] = 12'h246;//582
+
+	end
+	7'h45:
+	begin
+		ori_digit_bound_1[11:0] = 12'h1d;//29
+		ori_digit_bound_2[11:0] = 12'h63;//99
+		ori_digit_bound_3[11:0] = 12'ha9;//169
+		ori_digit_bound_4[11:0] = 12'hef;//239
+		ori_digit_bound_5[11:0] = 12'h136;//310
+		ori_digit_bound_6[11:0] = 12'h17c;//380
+		ori_digit_bound_7[11:0] = 12'h1c2;//450
+		ori_digit_bound_8[11:0] = 12'h208;//520
+		ori_digit_bound_9[11:0] = 12'h24f;//591
+
+	end
+	7'h46:
+	begin
+		ori_digit_bound_1[11:0] = 12'h1d;//29
+		ori_digit_bound_2[11:0] = 12'h64;//100
+		ori_digit_bound_3[11:0] = 12'hac;//172
+		ori_digit_bound_4[11:0] = 12'hf3;//243
+		ori_digit_bound_5[11:0] = 12'h13a;//314
+		ori_digit_bound_6[11:0] = 12'h181;//385
+		ori_digit_bound_7[11:0] = 12'h1c9;//457
+		ori_digit_bound_8[11:0] = 12'h210;//528
+		ori_digit_bound_9[11:0] = 12'h257;//599
+
+	end
+	7'h47:
+	begin
+		ori_digit_bound_1[11:0] = 12'h1d;//29
+		ori_digit_bound_2[11:0] = 12'h66;//102
+		ori_digit_bound_3[11:0] = 12'hae;//174
+		ori_digit_bound_4[11:0] = 12'hf6;//246
+		ori_digit_bound_5[11:0] = 12'h13e;//318
+		ori_digit_bound_6[11:0] = 12'h187;//391
+		ori_digit_bound_7[11:0] = 12'h1cf;//463
+		ori_digit_bound_8[11:0] = 12'h217;//535
+		ori_digit_bound_9[11:0] = 12'h260;//608
+
+	end
+	7'h48:
+	begin
+		ori_digit_bound_1[11:0] = 12'h1e;//30
+		ori_digit_bound_2[11:0] = 12'h67;//103
+		ori_digit_bound_3[11:0] = 12'hb0;//176
+		ori_digit_bound_4[11:0] = 12'hfa;//250
+		ori_digit_bound_5[11:0] = 12'h143;//323
+		ori_digit_bound_6[11:0] = 12'h18c;//396
+		ori_digit_bound_7[11:0] = 12'h1d5;//469
+		ori_digit_bound_8[11:0] = 12'h21f;//543
+		ori_digit_bound_9[11:0] = 12'h268;//616
+
+	end
+	7'h49:
+	begin
+		ori_digit_bound_1[11:0] = 12'h1e;//30
+		ori_digit_bound_2[11:0] = 12'h68;//104
+		ori_digit_bound_3[11:0] = 12'hb3;//179
+		ori_digit_bound_4[11:0] = 12'hfd;//253
+		ori_digit_bound_5[11:0] = 12'h147;//327
+		ori_digit_bound_6[11:0] = 12'h192;//402
+		ori_digit_bound_7[11:0] = 12'h1dc;//476
+		ori_digit_bound_8[11:0] = 12'h226;//550
+		ori_digit_bound_9[11:0] = 12'h270;//624
+
+	end
+	7'h4a:
+	begin
+		ori_digit_bound_1[11:0] = 12'h1f;//31
+		ori_digit_bound_2[11:0] = 12'h6a;//106
+		ori_digit_bound_3[11:0] = 12'hb5;//181
+		ori_digit_bound_4[11:0] = 12'h100;//256
+		ori_digit_bound_5[11:0] = 12'h14c;//332
+		ori_digit_bound_6[11:0] = 12'h197;//407
+		ori_digit_bound_7[11:0] = 12'h1e2;//482
+		ori_digit_bound_8[11:0] = 12'h22d;//557
+		ori_digit_bound_9[11:0] = 12'h279;//633
+
+	end
+	7'h4b:
+	begin
+		ori_digit_bound_1[11:0] = 12'h1f;//31
+		ori_digit_bound_2[11:0] = 12'h6b;//107
+		ori_digit_bound_3[11:0] = 12'hb8;//184
+		ori_digit_bound_4[11:0] = 12'h104;//260
+		ori_digit_bound_5[11:0] = 12'h150;//336
+		ori_digit_bound_6[11:0] = 12'h19c;//412
+		ori_digit_bound_7[11:0] = 12'h1e9;//489
+		ori_digit_bound_8[11:0] = 12'h235;//565
+		ori_digit_bound_9[11:0] = 12'h281;//641
+
+	end
+	7'h4c:
+	begin
+		ori_digit_bound_1[11:0] = 12'h1f;//31
+		ori_digit_bound_2[11:0] = 12'h6d;//109
+		ori_digit_bound_3[11:0] = 12'hba;//186
+		ori_digit_bound_4[11:0] = 12'h107;//263
+		ori_digit_bound_5[11:0] = 12'h154;//340
+		ori_digit_bound_6[11:0] = 12'h1a2;//418
+		ori_digit_bound_7[11:0] = 12'h1ef;//495
+		ori_digit_bound_8[11:0] = 12'h23c;//572
+		ori_digit_bound_9[11:0] = 12'h28a;//650
+
+	end
+	7'h4d:
+	begin
+		ori_digit_bound_1[11:0] = 12'h20;//32
+		ori_digit_bound_2[11:0] = 12'h6e;//110
+		ori_digit_bound_3[11:0] = 12'hbc;//188
+		ori_digit_bound_4[11:0] = 12'h10b;//267
+		ori_digit_bound_5[11:0] = 12'h159;//345
+		ori_digit_bound_6[11:0] = 12'h1a7;//423
+		ori_digit_bound_7[11:0] = 12'h1f5;//501
+		ori_digit_bound_8[11:0] = 12'h244;//580
+		ori_digit_bound_9[11:0] = 12'h292;//658
+
+	end
+	7'h4e:
+	begin
+		ori_digit_bound_1[11:0] = 12'h20;//32
+		ori_digit_bound_2[11:0] = 12'h6f;//111
+		ori_digit_bound_3[11:0] = 12'hbf;//191
+		ori_digit_bound_4[11:0] = 12'h10e;//270
+		ori_digit_bound_5[11:0] = 12'h15d;//349
+		ori_digit_bound_6[11:0] = 12'h1ad;//429
+		ori_digit_bound_7[11:0] = 12'h1fc;//508
+		ori_digit_bound_8[11:0] = 12'h24b;//587
+		ori_digit_bound_9[11:0] = 12'h29a;//666
+
+	end
+	7'h4f:
+	begin
+		ori_digit_bound_1[11:0] = 12'h21;//33
+		ori_digit_bound_2[11:0] = 12'h71;//113
+		ori_digit_bound_3[11:0] = 12'hc1;//193
+		ori_digit_bound_4[11:0] = 12'h111;//273
+		ori_digit_bound_5[11:0] = 12'h162;//354
+		ori_digit_bound_6[11:0] = 12'h1b2;//434
+		ori_digit_bound_7[11:0] = 12'h202;//514
+		ori_digit_bound_8[11:0] = 12'h252;//594
+		ori_digit_bound_9[11:0] = 12'h2a3;//675
+
+	end
+	7'h50:
+	begin
+		ori_digit_bound_1[11:0] = 12'h21;//33
+		ori_digit_bound_2[11:0] = 12'h72;//114
+		ori_digit_bound_3[11:0] = 12'hc4;//196
+		ori_digit_bound_4[11:0] = 12'h115;//277
+		ori_digit_bound_5[11:0] = 12'h166;//358
+		ori_digit_bound_6[11:0] = 12'h1b7;//439
+		ori_digit_bound_7[11:0] = 12'h209;//521
+		ori_digit_bound_8[11:0] = 12'h25a;//602
+		ori_digit_bound_9[11:0] = 12'h2ab;//683
+
+	end
+	7'h51:
+	begin
+		ori_digit_bound_1[11:0] = 12'h21;//33
+		ori_digit_bound_2[11:0] = 12'h74;//116
+		ori_digit_bound_3[11:0] = 12'hc6;//198
+		ori_digit_bound_4[11:0] = 12'h118;//280
+		ori_digit_bound_5[11:0] = 12'h16a;//362
+		ori_digit_bound_6[11:0] = 12'h1bd;//445
+		ori_digit_bound_7[11:0] = 12'h20f;//527
+		ori_digit_bound_8[11:0] = 12'h261;//609
+		ori_digit_bound_9[11:0] = 12'h2b4;//692
+
+	end
+	7'h52:
+	begin
+		ori_digit_bound_1[11:0] = 12'h22;//34
+		ori_digit_bound_2[11:0] = 12'h75;//117
+		ori_digit_bound_3[11:0] = 12'hc8;//200
+		ori_digit_bound_4[11:0] = 12'h11c;//284
+		ori_digit_bound_5[11:0] = 12'h16f;//367
+		ori_digit_bound_6[11:0] = 12'h1c2;//450
+		ori_digit_bound_7[11:0] = 12'h215;//533
+		ori_digit_bound_8[11:0] = 12'h269;//617
+		ori_digit_bound_9[11:0] = 12'h2bc;//700
+
+	end
+	7'h53:
+	begin
+		ori_digit_bound_1[11:0] = 12'h22;//34
+		ori_digit_bound_2[11:0] = 12'h76;//118
+		ori_digit_bound_3[11:0] = 12'hcb;//203
+		ori_digit_bound_4[11:0] = 12'h11f;//287
+		ori_digit_bound_5[11:0] = 12'h173;//371
+		ori_digit_bound_6[11:0] = 12'h1c8;//456
+		ori_digit_bound_7[11:0] = 12'h21c;//540
+		ori_digit_bound_8[11:0] = 12'h270;//624
+		ori_digit_bound_9[11:0] = 12'h2c4;//708
+
+	end
+	7'h54:
+	begin
+		ori_digit_bound_1[11:0] = 12'h23;//35
+		ori_digit_bound_2[11:0] = 12'h78;//120
+		ori_digit_bound_3[11:0] = 12'hcd;//205
+		ori_digit_bound_4[11:0] = 12'h122;//290
+		ori_digit_bound_5[11:0] = 12'h178;//376
+		ori_digit_bound_6[11:0] = 12'h1cd;//461
+		ori_digit_bound_7[11:0] = 12'h222;//546
+		ori_digit_bound_8[11:0] = 12'h277;//631
+		ori_digit_bound_9[11:0] = 12'h2cd;//717
+
+	end
+	7'h55:
+	begin
+		ori_digit_bound_1[11:0] = 12'h23;//35
+		ori_digit_bound_2[11:0] = 12'h79;//121
+		ori_digit_bound_3[11:0] = 12'hd0;//208
+		ori_digit_bound_4[11:0] = 12'h126;//294
+		ori_digit_bound_5[11:0] = 12'h17c;//380
+		ori_digit_bound_6[11:0] = 12'h1d2;//466
+		ori_digit_bound_7[11:0] = 12'h229;//553
+		ori_digit_bound_8[11:0] = 12'h27f;//639
+		ori_digit_bound_9[11:0] = 12'h2d5;//725
+
+	end
+	7'h56:
+	begin
+		ori_digit_bound_1[11:0] = 12'h23;//35
+		ori_digit_bound_2[11:0] = 12'h7b;//123
+		ori_digit_bound_3[11:0] = 12'hd2;//210
+		ori_digit_bound_4[11:0] = 12'h129;//297
+		ori_digit_bound_5[11:0] = 12'h180;//384
+		ori_digit_bound_6[11:0] = 12'h1d8;//472
+		ori_digit_bound_7[11:0] = 12'h22f;//559
+		ori_digit_bound_8[11:0] = 12'h286;//646
+		ori_digit_bound_9[11:0] = 12'h2de;//734
+
+	end
+	7'h57:
+	begin
+		ori_digit_bound_1[11:0] = 12'h24;//36
+		ori_digit_bound_2[11:0] = 12'h7c;//124
+		ori_digit_bound_3[11:0] = 12'hd4;//212
+		ori_digit_bound_4[11:0] = 12'h12d;//301
+		ori_digit_bound_5[11:0] = 12'h185;//389
+		ori_digit_bound_6[11:0] = 12'h1dd;//477
+		ori_digit_bound_7[11:0] = 12'h235;//565
+		ori_digit_bound_8[11:0] = 12'h28e;//654
+		ori_digit_bound_9[11:0] = 12'h2e6;//742
+
+	end
+	7'h58:
+	begin
+		ori_digit_bound_1[11:0] = 12'h24;//36
+		ori_digit_bound_2[11:0] = 12'h7d;//125
+		ori_digit_bound_3[11:0] = 12'hd7;//215
+		ori_digit_bound_4[11:0] = 12'h130;//304
+		ori_digit_bound_5[11:0] = 12'h189;//393
+		ori_digit_bound_6[11:0] = 12'h1e3;//483
+		ori_digit_bound_7[11:0] = 12'h23c;//572
+		ori_digit_bound_8[11:0] = 12'h295;//661
+		ori_digit_bound_9[11:0] = 12'h2ee;//750
+
+	end
+	7'h59:
+	begin
+		ori_digit_bound_1[11:0] = 12'h25;//37
+		ori_digit_bound_2[11:0] = 12'h7f;//127
+		ori_digit_bound_3[11:0] = 12'hd9;//217
+		ori_digit_bound_4[11:0] = 12'h133;//307
+		ori_digit_bound_5[11:0] = 12'h18e;//398
+		ori_digit_bound_6[11:0] = 12'h1e8;//488
+		ori_digit_bound_7[11:0] = 12'h242;//578
+		ori_digit_bound_8[11:0] = 12'h29c;//668
+		ori_digit_bound_9[11:0] = 12'h2f7;//759
+
+	end
+	7'h5a:
+	begin
+		ori_digit_bound_1[11:0] = 12'h25;//37
+		ori_digit_bound_2[11:0] = 12'h80;//128
+		ori_digit_bound_3[11:0] = 12'hdc;//220
+		ori_digit_bound_4[11:0] = 12'h137;//311
+		ori_digit_bound_5[11:0] = 12'h192;//402
+		ori_digit_bound_6[11:0] = 12'h1ed;//493
+		ori_digit_bound_7[11:0] = 12'h249;//585
+		ori_digit_bound_8[11:0] = 12'h2a4;//676
+		ori_digit_bound_9[11:0] = 12'h2ff;//767
+
+	end
+	7'h5b:
+	begin
+		ori_digit_bound_1[11:0] = 12'h25;//37
+		ori_digit_bound_2[11:0] = 12'h82;//130
+		ori_digit_bound_3[11:0] = 12'hde;//222
+		ori_digit_bound_4[11:0] = 12'h13a;//314
+		ori_digit_bound_5[11:0] = 12'h196;//406
+		ori_digit_bound_6[11:0] = 12'h1f3;//499
+		ori_digit_bound_7[11:0] = 12'h24f;//591
+		ori_digit_bound_8[11:0] = 12'h2ab;//683
+		ori_digit_bound_9[11:0] = 12'h308;//776
+
+	end
+	7'h5c:
+	begin
+		ori_digit_bound_1[11:0] = 12'h26;//38
+		ori_digit_bound_2[11:0] = 12'h83;//131
+		ori_digit_bound_3[11:0] = 12'he0;//224
+		ori_digit_bound_4[11:0] = 12'h13e;//318
+		ori_digit_bound_5[11:0] = 12'h19b;//411
+		ori_digit_bound_6[11:0] = 12'h1f8;//504
+		ori_digit_bound_7[11:0] = 12'h255;//597
+		ori_digit_bound_8[11:0] = 12'h2b3;//691
+		ori_digit_bound_9[11:0] = 12'h310;//784
+
+	end
+	7'h5d:
+	begin
+		ori_digit_bound_1[11:0] = 12'h26;//38
+		ori_digit_bound_2[11:0] = 12'h84;//132
+		ori_digit_bound_3[11:0] = 12'he3;//227
+		ori_digit_bound_4[11:0] = 12'h141;//321
+		ori_digit_bound_5[11:0] = 12'h19f;//415
+		ori_digit_bound_6[11:0] = 12'h1fe;//510
+		ori_digit_bound_7[11:0] = 12'h25c;//604
+		ori_digit_bound_8[11:0] = 12'h2ba;//698
+		ori_digit_bound_9[11:0] = 12'h318;//792
+
+	end
+	7'h5e:
+	begin
+		ori_digit_bound_1[11:0] = 12'h27;//39
+		ori_digit_bound_2[11:0] = 12'h86;//134
+		ori_digit_bound_3[11:0] = 12'he5;//229
+		ori_digit_bound_4[11:0] = 12'h144;//324
+		ori_digit_bound_5[11:0] = 12'h1a4;//420
+		ori_digit_bound_6[11:0] = 12'h203;//515
+		ori_digit_bound_7[11:0] = 12'h262;//610
+		ori_digit_bound_8[11:0] = 12'h2c1;//705
+		ori_digit_bound_9[11:0] = 12'h321;//801
+
+	end
+	7'h5f:
+	begin
+		ori_digit_bound_1[11:0] = 12'h27;//39
+		ori_digit_bound_2[11:0] = 12'h87;//135
+		ori_digit_bound_3[11:0] = 12'he8;//232
+		ori_digit_bound_4[11:0] = 12'h148;//328
+		ori_digit_bound_5[11:0] = 12'h1a8;//424
+		ori_digit_bound_6[11:0] = 12'h208;//520
+		ori_digit_bound_7[11:0] = 12'h269;//617
+		ori_digit_bound_8[11:0] = 12'h2c9;//713
+		ori_digit_bound_9[11:0] = 12'h329;//809
+
+	end
+	7'h60:
+	begin
+		ori_digit_bound_1[11:0] = 12'h27;//39
+		ori_digit_bound_2[11:0] = 12'h89;//137
+		ori_digit_bound_3[11:0] = 12'hea;//234
+		ori_digit_bound_4[11:0] = 12'h14b;//331
+		ori_digit_bound_5[11:0] = 12'h1ac;//428
+		ori_digit_bound_6[11:0] = 12'h20e;//526
+		ori_digit_bound_7[11:0] = 12'h26f;//623
+		ori_digit_bound_8[11:0] = 12'h2d0;//720
+		ori_digit_bound_9[11:0] = 12'h332;//818
+
+	end
+	7'h61:
+	begin
+		ori_digit_bound_1[11:0] = 12'h28;//40
+		ori_digit_bound_2[11:0] = 12'h8a;//138
+		ori_digit_bound_3[11:0] = 12'hec;//236
+		ori_digit_bound_4[11:0] = 12'h14f;//335
+		ori_digit_bound_5[11:0] = 12'h1b1;//433
+		ori_digit_bound_6[11:0] = 12'h213;//531
+		ori_digit_bound_7[11:0] = 12'h275;//629
+		ori_digit_bound_8[11:0] = 12'h2d8;//728
+		ori_digit_bound_9[11:0] = 12'h33a;//826
+
+	end
+	7'h62:
+	begin
+		ori_digit_bound_1[11:0] = 12'h28;//40
+		ori_digit_bound_2[11:0] = 12'h8b;//139
+		ori_digit_bound_3[11:0] = 12'hef;//239
+		ori_digit_bound_4[11:0] = 12'h152;//338
+		ori_digit_bound_5[11:0] = 12'h1b5;//437
+		ori_digit_bound_6[11:0] = 12'h219;//537
+		ori_digit_bound_7[11:0] = 12'h27c;//636
+		ori_digit_bound_8[11:0] = 12'h2df;//735
+		ori_digit_bound_9[11:0] = 12'h342;//834
+
+	end
+	7'h63:
+	begin
+		ori_digit_bound_1[11:0] = 12'h29;//41
+		ori_digit_bound_2[11:0] = 12'h8d;//141
+		ori_digit_bound_3[11:0] = 12'hf1;//241
+		ori_digit_bound_4[11:0] = 12'h155;//341
+		ori_digit_bound_5[11:0] = 12'h1ba;//442
+		ori_digit_bound_6[11:0] = 12'h21e;//542
+		ori_digit_bound_7[11:0] = 12'h282;//642
+		ori_digit_bound_8[11:0] = 12'h2e6;//742
+		ori_digit_bound_9[11:0] = 12'h34b;//843
+
+	end
+	7'h64:
+	begin
+		ori_digit_bound_1[11:0] = 12'h29;//41
+		ori_digit_bound_2[11:0] = 12'h8e;//142
+		ori_digit_bound_3[11:0] = 12'hf4;//244
+		ori_digit_bound_4[11:0] = 12'h159;//345
+		ori_digit_bound_5[11:0] = 12'h1be;//446
+		ori_digit_bound_6[11:0] = 12'h223;//547
+		ori_digit_bound_7[11:0] = 12'h289;//649
+		ori_digit_bound_8[11:0] = 12'h2ee;//750
+		ori_digit_bound_9[11:0] = 12'h353;//851
+
+	end
+	7'h65:
+	begin
+		ori_digit_bound_1[11:0] = 12'h29;//41
+		ori_digit_bound_2[11:0] = 12'h90;//144
+		ori_digit_bound_3[11:0] = 12'hf6;//246
+		ori_digit_bound_4[11:0] = 12'h15c;//348
+		ori_digit_bound_5[11:0] = 12'h1c2;//450
+		ori_digit_bound_6[11:0] = 12'h229;//553
+		ori_digit_bound_7[11:0] = 12'h28f;//655
+		ori_digit_bound_8[11:0] = 12'h2f5;//757
+		ori_digit_bound_9[11:0] = 12'h35c;//860
+
+	end
+	7'h66:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2a;//42
+		ori_digit_bound_2[11:0] = 12'h91;//145
+		ori_digit_bound_3[11:0] = 12'hf8;//248
+		ori_digit_bound_4[11:0] = 12'h160;//352
+		ori_digit_bound_5[11:0] = 12'h1c7;//455
+		ori_digit_bound_6[11:0] = 12'h22e;//558
+		ori_digit_bound_7[11:0] = 12'h295;//661
+		ori_digit_bound_8[11:0] = 12'h2fd;//765
+		ori_digit_bound_9[11:0] = 12'h364;//868
+
+	end
+	7'h67:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2a;//42
+		ori_digit_bound_2[11:0] = 12'h92;//146
+		ori_digit_bound_3[11:0] = 12'hfb;//251
+		ori_digit_bound_4[11:0] = 12'h163;//355
+		ori_digit_bound_5[11:0] = 12'h1cb;//459
+		ori_digit_bound_6[11:0] = 12'h234;//564
+		ori_digit_bound_7[11:0] = 12'h29c;//668
+		ori_digit_bound_8[11:0] = 12'h304;//772
+		ori_digit_bound_9[11:0] = 12'h36c;//876
+
+	end
+	7'h68:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2b;//43
+		ori_digit_bound_2[11:0] = 12'h94;//148
+		ori_digit_bound_3[11:0] = 12'hfd;//253
+		ori_digit_bound_4[11:0] = 12'h166;//358
+		ori_digit_bound_5[11:0] = 12'h1d0;//464
+		ori_digit_bound_6[11:0] = 12'h239;//569
+		ori_digit_bound_7[11:0] = 12'h2a2;//674
+		ori_digit_bound_8[11:0] = 12'h30b;//779
+		ori_digit_bound_9[11:0] = 12'h375;//885
+
+	end
+	7'h69:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2b;//43
+		ori_digit_bound_2[11:0] = 12'h95;//149
+		ori_digit_bound_3[11:0] = 12'h100;//256
+		ori_digit_bound_4[11:0] = 12'h16a;//362
+		ori_digit_bound_5[11:0] = 12'h1d4;//468
+		ori_digit_bound_6[11:0] = 12'h23e;//574
+		ori_digit_bound_7[11:0] = 12'h2a9;//681
+		ori_digit_bound_8[11:0] = 12'h313;//787
+		ori_digit_bound_9[11:0] = 12'h37d;//893
+
+	end
+	7'h6a:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2b;//43
+		ori_digit_bound_2[11:0] = 12'h97;//151
+		ori_digit_bound_3[11:0] = 12'h102;//258
+		ori_digit_bound_4[11:0] = 12'h16d;//365
+		ori_digit_bound_5[11:0] = 12'h1d8;//472
+		ori_digit_bound_6[11:0] = 12'h244;//580
+		ori_digit_bound_7[11:0] = 12'h2af;//687
+		ori_digit_bound_8[11:0] = 12'h31a;//794
+		ori_digit_bound_9[11:0] = 12'h386;//902
+
+	end
+	7'h6b:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2c;//44
+		ori_digit_bound_2[11:0] = 12'h98;//152
+		ori_digit_bound_3[11:0] = 12'h104;//260
+		ori_digit_bound_4[11:0] = 12'h171;//369
+		ori_digit_bound_5[11:0] = 12'h1dd;//477
+		ori_digit_bound_6[11:0] = 12'h249;//585
+		ori_digit_bound_7[11:0] = 12'h2b5;//693
+		ori_digit_bound_8[11:0] = 12'h322;//802
+		ori_digit_bound_9[11:0] = 12'h38e;//910
+
+	end
+	7'h6c:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2c;//44
+		ori_digit_bound_2[11:0] = 12'h99;//153
+		ori_digit_bound_3[11:0] = 12'h107;//263
+		ori_digit_bound_4[11:0] = 12'h174;//372
+		ori_digit_bound_5[11:0] = 12'h1e1;//481
+		ori_digit_bound_6[11:0] = 12'h24f;//591
+		ori_digit_bound_7[11:0] = 12'h2bc;//700
+		ori_digit_bound_8[11:0] = 12'h329;//809
+		ori_digit_bound_9[11:0] = 12'h396;//918
+
+	end
+	7'h6d:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2d;//45
+		ori_digit_bound_2[11:0] = 12'h9b;//155
+		ori_digit_bound_3[11:0] = 12'h109;//265
+		ori_digit_bound_4[11:0] = 12'h177;//375
+		ori_digit_bound_5[11:0] = 12'h1e6;//486
+		ori_digit_bound_6[11:0] = 12'h254;//596
+		ori_digit_bound_7[11:0] = 12'h2c2;//706
+		ori_digit_bound_8[11:0] = 12'h330;//816
+		ori_digit_bound_9[11:0] = 12'h39f;//927
+
+	end
+	7'h6e:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2d;//45
+		ori_digit_bound_2[11:0] = 12'h9c;//156
+		ori_digit_bound_3[11:0] = 12'h10c;//268
+		ori_digit_bound_4[11:0] = 12'h17b;//379
+		ori_digit_bound_5[11:0] = 12'h1ea;//490
+		ori_digit_bound_6[11:0] = 12'h259;//601
+		ori_digit_bound_7[11:0] = 12'h2c9;//713
+		ori_digit_bound_8[11:0] = 12'h338;//824
+		ori_digit_bound_9[11:0] = 12'h3a7;//935
+
+	end
+	7'h6f:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2d;//45
+		ori_digit_bound_2[11:0] = 12'h9e;//158
+		ori_digit_bound_3[11:0] = 12'h10e;//270
+		ori_digit_bound_4[11:0] = 12'h17e;//382
+		ori_digit_bound_5[11:0] = 12'h1ee;//494
+		ori_digit_bound_6[11:0] = 12'h25f;//607
+		ori_digit_bound_7[11:0] = 12'h2cf;//719
+		ori_digit_bound_8[11:0] = 12'h33f;//831
+		ori_digit_bound_9[11:0] = 12'h3b0;//944
+
+	end
+	7'h70:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2e;//46
+		ori_digit_bound_2[11:0] = 12'h9f;//159
+		ori_digit_bound_3[11:0] = 12'h110;//272
+		ori_digit_bound_4[11:0] = 12'h182;//386
+		ori_digit_bound_5[11:0] = 12'h1f3;//499
+		ori_digit_bound_6[11:0] = 12'h264;//612
+		ori_digit_bound_7[11:0] = 12'h2d5;//725
+		ori_digit_bound_8[11:0] = 12'h347;//839
+		ori_digit_bound_9[11:0] = 12'h3b8;//952
+
+	end
+	7'h71:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2e;//46
+		ori_digit_bound_2[11:0] = 12'ha0;//160
+		ori_digit_bound_3[11:0] = 12'h113;//275
+		ori_digit_bound_4[11:0] = 12'h185;//389
+		ori_digit_bound_5[11:0] = 12'h1f7;//503
+		ori_digit_bound_6[11:0] = 12'h26a;//618
+		ori_digit_bound_7[11:0] = 12'h2dc;//732
+		ori_digit_bound_8[11:0] = 12'h34e;//846
+		ori_digit_bound_9[11:0] = 12'h3c0;//960
+
+	end
+	7'h72:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2f;//47
+		ori_digit_bound_2[11:0] = 12'ha2;//162
+		ori_digit_bound_3[11:0] = 12'h115;//277
+		ori_digit_bound_4[11:0] = 12'h188;//392
+		ori_digit_bound_5[11:0] = 12'h1fc;//508
+		ori_digit_bound_6[11:0] = 12'h26f;//623
+		ori_digit_bound_7[11:0] = 12'h2e2;//738
+		ori_digit_bound_8[11:0] = 12'h355;//853
+		ori_digit_bound_9[11:0] = 12'h3c9;//969
+
+	end
+	7'h73:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2f;//47
+		ori_digit_bound_2[11:0] = 12'ha3;//163
+		ori_digit_bound_3[11:0] = 12'h118;//280
+		ori_digit_bound_4[11:0] = 12'h18c;//396
+		ori_digit_bound_5[11:0] = 12'h200;//512
+		ori_digit_bound_6[11:0] = 12'h274;//628
+		ori_digit_bound_7[11:0] = 12'h2e9;//745
+		ori_digit_bound_8[11:0] = 12'h35d;//861
+		ori_digit_bound_9[11:0] = 12'h3d1;//977
+
+	end
+	7'h74:
+	begin
+		ori_digit_bound_1[11:0] = 12'h2f;//47
+		ori_digit_bound_2[11:0] = 12'ha5;//165
+		ori_digit_bound_3[11:0] = 12'h11a;//282
+		ori_digit_bound_4[11:0] = 12'h18f;//399
+		ori_digit_bound_5[11:0] = 12'h204;//516
+		ori_digit_bound_6[11:0] = 12'h27a;//634
+		ori_digit_bound_7[11:0] = 12'h2ef;//751
+		ori_digit_bound_8[11:0] = 12'h364;//868
+		ori_digit_bound_9[11:0] = 12'h3da;//986
+
+	end
+	7'h75:
+	begin
+		ori_digit_bound_1[11:0] = 12'h30;//48
+		ori_digit_bound_2[11:0] = 12'ha6;//166
+		ori_digit_bound_3[11:0] = 12'h11c;//284
+		ori_digit_bound_4[11:0] = 12'h193;//403
+		ori_digit_bound_5[11:0] = 12'h209;//521
+		ori_digit_bound_6[11:0] = 12'h27f;//639
+		ori_digit_bound_7[11:0] = 12'h2f5;//757
+		ori_digit_bound_8[11:0] = 12'h36c;//876
+		ori_digit_bound_9[11:0] = 12'h3e2;//994
+
+	end
+	7'h76:
+	begin
+		ori_digit_bound_1[11:0] = 12'h30;//48
+		ori_digit_bound_2[11:0] = 12'ha7;//167
+		ori_digit_bound_3[11:0] = 12'h11f;//287
+		ori_digit_bound_4[11:0] = 12'h196;//406
+		ori_digit_bound_5[11:0] = 12'h20d;//525
+		ori_digit_bound_6[11:0] = 12'h285;//645
+		ori_digit_bound_7[11:0] = 12'h2fc;//764
+		ori_digit_bound_8[11:0] = 12'h373;//883
+		ori_digit_bound_9[11:0] = 12'h3ea;//1002
+
+	end
+	7'h77:
+	begin
+		ori_digit_bound_1[11:0] = 12'h31;//49
+		ori_digit_bound_2[11:0] = 12'ha9;//169
+		ori_digit_bound_3[11:0] = 12'h121;//289
+		ori_digit_bound_4[11:0] = 12'h199;//409
+		ori_digit_bound_5[11:0] = 12'h212;//530
+		ori_digit_bound_6[11:0] = 12'h28a;//650
+		ori_digit_bound_7[11:0] = 12'h302;//770
+		ori_digit_bound_8[11:0] = 12'h37a;//890
+		ori_digit_bound_9[11:0] = 12'h3f3;//1011
+
+	end
+	7'h78:
+	begin
+		ori_digit_bound_1[11:0] = 12'h31;//49
+		ori_digit_bound_2[11:0] = 12'haa;//170
+		ori_digit_bound_3[11:0] = 12'h124;//292
+		ori_digit_bound_4[11:0] = 12'h19d;//413
+		ori_digit_bound_5[11:0] = 12'h216;//534
+		ori_digit_bound_6[11:0] = 12'h28f;//655
+		ori_digit_bound_7[11:0] = 12'h309;//777
+		ori_digit_bound_8[11:0] = 12'h382;//898
+		ori_digit_bound_9[11:0] = 12'h3fb;//1019
+
+	end
+	7'h79:
+	begin
+		ori_digit_bound_1[11:0] = 12'h31;//49
+		ori_digit_bound_2[11:0] = 12'hac;//172
+		ori_digit_bound_3[11:0] = 12'h126;//294
+		ori_digit_bound_4[11:0] = 12'h1a0;//416
+		ori_digit_bound_5[11:0] = 12'h21a;//538
+		ori_digit_bound_6[11:0] = 12'h295;//661
+		ori_digit_bound_7[11:0] = 12'h30f;//783
+		ori_digit_bound_8[11:0] = 12'h389;//905
+		ori_digit_bound_9[11:0] = 12'h404;//1028
+
+	end
+	7'h7a:
+	begin
+		ori_digit_bound_1[11:0] = 12'h32;//50
+		ori_digit_bound_2[11:0] = 12'had;//173
+		ori_digit_bound_3[11:0] = 12'h128;//296
+		ori_digit_bound_4[11:0] = 12'h1a4;//420
+		ori_digit_bound_5[11:0] = 12'h21f;//543
+		ori_digit_bound_6[11:0] = 12'h29a;//666
+		ori_digit_bound_7[11:0] = 12'h315;//789
+		ori_digit_bound_8[11:0] = 12'h391;//913
+		ori_digit_bound_9[11:0] = 12'h40c;//1036
+
+	end
+	7'h7b:
+	begin
+		ori_digit_bound_1[11:0] = 12'h32;//50
+		ori_digit_bound_2[11:0] = 12'hae;//174
+		ori_digit_bound_3[11:0] = 12'h12b;//299
+		ori_digit_bound_4[11:0] = 12'h1a7;//423
+		ori_digit_bound_5[11:0] = 12'h223;//547
+		ori_digit_bound_6[11:0] = 12'h2a0;//672
+		ori_digit_bound_7[11:0] = 12'h31c;//796
+		ori_digit_bound_8[11:0] = 12'h398;//920
+		ori_digit_bound_9[11:0] = 12'h414;//1044
+
+	end
+	7'h7c:
+	begin
+		ori_digit_bound_1[11:0] = 12'h33;//51
+		ori_digit_bound_2[11:0] = 12'hb0;//176
+		ori_digit_bound_3[11:0] = 12'h12d;//301
+		ori_digit_bound_4[11:0] = 12'h1aa;//426
+		ori_digit_bound_5[11:0] = 12'h228;//552
+		ori_digit_bound_6[11:0] = 12'h2a5;//677
+		ori_digit_bound_7[11:0] = 12'h322;//802
+		ori_digit_bound_8[11:0] = 12'h39f;//927
+		ori_digit_bound_9[11:0] = 12'h41d;//1053
+
+	end
+	7'h7d:
+	begin
+		ori_digit_bound_1[11:0] = 12'h33;//51
+		ori_digit_bound_2[11:0] = 12'hb1;//177
+		ori_digit_bound_3[11:0] = 12'h130;//304
+		ori_digit_bound_4[11:0] = 12'h1ae;//430
+		ori_digit_bound_5[11:0] = 12'h22c;//556
+		ori_digit_bound_6[11:0] = 12'h2aa;//682
+		ori_digit_bound_7[11:0] = 12'h329;//809
+		ori_digit_bound_8[11:0] = 12'h3a7;//935
+		ori_digit_bound_9[11:0] = 12'h425;//1061
+
+	end
+	7'h7e:
+	begin
+		ori_digit_bound_1[11:0] = 12'h33;//51
+		ori_digit_bound_2[11:0] = 12'hb3;//179
+		ori_digit_bound_3[11:0] = 12'h132;//306
+		ori_digit_bound_4[11:0] = 12'h1b1;//433
+		ori_digit_bound_5[11:0] = 12'h230;//560
+		ori_digit_bound_6[11:0] = 12'h2b0;//688
+		ori_digit_bound_7[11:0] = 12'h32f;//815
+		ori_digit_bound_8[11:0] = 12'h3ae;//942
+		ori_digit_bound_9[11:0] = 12'h42e;//1070
+
+	end
+	7'h7f:
+	begin
+		ori_digit_bound_1[11:0] = 12'h34;//52
+		ori_digit_bound_2[11:0] = 12'hb4;//180
+		ori_digit_bound_3[11:0] = 12'h134;//308
+		ori_digit_bound_4[11:0] = 12'h1b5;//437
+		ori_digit_bound_5[11:0] = 12'h235;//565
+		ori_digit_bound_6[11:0] = 12'h2b5;//693
+		ori_digit_bound_7[11:0] = 12'h335;//821
+		ori_digit_bound_8[11:0] = 12'h3b6;//950
+		ori_digit_bound_9[11:0] = 12'h436;//1078
+
+	end
+	7'h00:
+	begin
+		ori_digit_bound_1[11:0] = 12'h34;//52
+		ori_digit_bound_2[11:0] = 12'hb5;//181
+		ori_digit_bound_3[11:0] = 12'h137;//311
+		ori_digit_bound_4[11:0] = 12'h1b8;//440
+		ori_digit_bound_5[11:0] = 12'h239;//569
+		ori_digit_bound_6[11:0] = 12'h2bb;//699
+		ori_digit_bound_7[11:0] = 12'h33c;//828
+		ori_digit_bound_8[11:0] = 12'h3bd;//957
+		ori_digit_bound_9[11:0] = 12'h43e;//1086
+	end
+  default:
+  begin
+		ori_digit_bound_1[11:0] = {12{1'bx}};
+		ori_digit_bound_2[11:0] = {12{1'bx}};
+		ori_digit_bound_3[11:0] = {12{1'bx}};
+		ori_digit_bound_4[11:0] = {12{1'bx}};
+		ori_digit_bound_5[11:0] = {12{1'bx}};
+		ori_digit_bound_6[11:0] = {12{1'bx}};
+		ori_digit_bound_7[11:0] = {12{1'bx}};
+		ori_digit_bound_8[11:0] = {12{1'bx}};
+		ori_digit_bound_9[11:0] = {12{1'bx}};
+	end
+endcase
+// &CombEnd; @886
+end
+// &CombBeg; @887
+always @( bound_sel[6:0]) // Combinational lookup: bound_sel[6:0] picks the nine 12-bit constants sqrt_digit_boundp2_1..9
+begin // each output feeds sqrt_digit_bound2_N, which takes this table when sqrt_secd_round_sign is 0
+case(bound_sel[6:0]) // only 7'h40, 7'h50, 7'h60, 7'h70 and 7'h00 are decoded; any other value takes the default arm
+	7'h40: // gaps between consecutive bounds below: 65, 66, ..., 72
+	begin
+		sqrt_digit_boundp2_1[11:0] = 12'h21;//33
+		sqrt_digit_boundp2_2[11:0] = 12'h62;//98
+		sqrt_digit_boundp2_3[11:0] = 12'ha4;//164
+		sqrt_digit_boundp2_4[11:0] = 12'he7;//231
+		sqrt_digit_boundp2_5[11:0] = 12'h12b;//299
+		sqrt_digit_boundp2_6[11:0] = 12'h170;//368
+		sqrt_digit_boundp2_7[11:0] = 12'h1b6;//438
+		sqrt_digit_boundp2_8[11:0] = 12'h1fd;//509
+		sqrt_digit_boundp2_9[11:0] = 12'h245;//581
+	end
+	7'h50: // gaps between consecutive bounds below: 81, 82, ..., 88
+	begin
+		sqrt_digit_boundp2_1[11:0] = 12'h29;//41
+		sqrt_digit_boundp2_2[11:0] = 12'h7a;//122
+		sqrt_digit_boundp2_3[11:0] = 12'hcc;//204
+		sqrt_digit_boundp2_4[11:0] = 12'h11f;//287
+		sqrt_digit_boundp2_5[11:0] = 12'h173;//371
+		sqrt_digit_boundp2_6[11:0] = 12'h1c8;//456
+		sqrt_digit_boundp2_7[11:0] = 12'h21e;//542
+		sqrt_digit_boundp2_8[11:0] = 12'h275;//629
+		sqrt_digit_boundp2_9[11:0] = 12'h2cd;//717
+	end
+	7'h60: // gaps between consecutive bounds below: 97, 98, ..., 104
+	begin
+		sqrt_digit_boundp2_1[11:0] = 12'h31;//49
+		sqrt_digit_boundp2_2[11:0] = 12'h92;//146
+		sqrt_digit_boundp2_3[11:0] = 12'hf4;//244
+		sqrt_digit_boundp2_4[11:0] = 12'h157;//343
+		sqrt_digit_boundp2_5[11:0] = 12'h1bb;//443
+		sqrt_digit_boundp2_6[11:0] = 12'h220;//544
+		sqrt_digit_boundp2_7[11:0] = 12'h286;//646
+		sqrt_digit_boundp2_8[11:0] = 12'h2ed;//749
+		sqrt_digit_boundp2_9[11:0] = 12'h355;//853
+	end
+	7'h70: // gaps between consecutive bounds below: 113, 114, ..., 120
+	begin
+		sqrt_digit_boundp2_1[11:0] = 12'h39;//57
+		sqrt_digit_boundp2_2[11:0] = 12'haa;//170
+		sqrt_digit_boundp2_3[11:0] = 12'h11c;//284
+		sqrt_digit_boundp2_4[11:0] = 12'h18f;//399
+		sqrt_digit_boundp2_5[11:0] = 12'h203;//515
+		sqrt_digit_boundp2_6[11:0] = 12'h278;//632
+		sqrt_digit_boundp2_7[11:0] = 12'h2ee;//750
+		sqrt_digit_boundp2_8[11:0] = 12'h365;//869
+		sqrt_digit_boundp2_9[11:0] = 12'h3dd;//989
+	end
+	7'h00: // gaps between consecutive bounds below: 129, 130, ..., 136
+	begin
+		sqrt_digit_boundp2_1[11:0] = 12'h41;//65
+		sqrt_digit_boundp2_2[11:0] = 12'hc2;//194
+		sqrt_digit_boundp2_3[11:0] = 12'h144;//324
+		sqrt_digit_boundp2_4[11:0] = 12'h1c7;//455
+		sqrt_digit_boundp2_5[11:0] = 12'h24b;//587
+		sqrt_digit_boundp2_6[11:0] = 12'h2d0;//720
+		sqrt_digit_boundp2_7[11:0] = 12'h356;//854
+		sqrt_digit_boundp2_8[11:0] = 12'h3dd;//989
+		sqrt_digit_boundp2_9[11:0] = 12'h465;//1125
+	end
+  default: // undecoded selector: all nine outputs are driven to X
+  begin
+		sqrt_digit_boundp2_1[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundp2_2[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundp2_3[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundp2_4[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundp2_5[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundp2_6[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundp2_7[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundp2_8[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundp2_9[11:0] = {12{1'bx}};// X (selector not decoded)
+	end
+
+
+endcase
+// &CombEnd; @964
+end
+// &CombBeg; @965
+always @( bound_sel[6:0]) // Combinational lookup: bound_sel[6:0] picks the nine 12-bit constants sqrt_digit_boundm2_1..9
+begin // each output feeds sqrt_digit_bound2_N, which takes this table when sqrt_secd_round_sign is 1
+case(bound_sel[6:0]) // only 7'h40, 7'h50, 7'h60, 7'h70 and 7'h00 are decoded; any other value takes the default arm
+  7'h40: // gaps between consecutive bounds below: 63, 62, ..., 56
+	begin
+		sqrt_digit_boundm2_1[11:0] = 12'h20;//32
+		sqrt_digit_boundm2_2[11:0] = 12'h5f;//95
+		sqrt_digit_boundm2_3[11:0] = 12'h9d;//157
+		sqrt_digit_boundm2_4[11:0] = 12'hda;//218
+		sqrt_digit_boundm2_5[11:0] = 12'h116;//278
+		sqrt_digit_boundm2_6[11:0] = 12'h151;//337
+		sqrt_digit_boundm2_7[11:0] = 12'h18b;//395
+		sqrt_digit_boundm2_8[11:0] = 12'h1c4;//452
+		sqrt_digit_boundm2_9[11:0] = 12'h1fc;//508
+	end
+	7'h50: // gaps between consecutive bounds below: 79, 78, ..., 72
+	begin
+		sqrt_digit_boundm2_1[11:0] = 12'h28;//40
+		sqrt_digit_boundm2_2[11:0] = 12'h77;//119
+		sqrt_digit_boundm2_3[11:0] = 12'hc5;//197
+		sqrt_digit_boundm2_4[11:0] = 12'h112;//274
+		sqrt_digit_boundm2_5[11:0] = 12'h15e;//350
+		sqrt_digit_boundm2_6[11:0] = 12'h1a9;//425
+		sqrt_digit_boundm2_7[11:0] = 12'h1f3;//499
+		sqrt_digit_boundm2_8[11:0] = 12'h23c;//572
+		sqrt_digit_boundm2_9[11:0] = 12'h284;//644
+	end
+	7'h60: // gaps between consecutive bounds below: 95, 94, ..., 88
+	begin
+		sqrt_digit_boundm2_1[11:0] = 12'h30;//48
+		sqrt_digit_boundm2_2[11:0] = 12'h8f;//143
+		sqrt_digit_boundm2_3[11:0] = 12'hed;//237
+		sqrt_digit_boundm2_4[11:0] = 12'h14a;//330
+		sqrt_digit_boundm2_5[11:0] = 12'h1a6;//422
+		sqrt_digit_boundm2_6[11:0] = 12'h201;//513
+		sqrt_digit_boundm2_7[11:0] = 12'h25b;//603
+		sqrt_digit_boundm2_8[11:0] = 12'h2b4;//692
+		sqrt_digit_boundm2_9[11:0] = 12'h30c;//780
+	end
+	7'h70: // gaps between consecutive bounds below: 111, 110, ..., 104
+	begin
+		sqrt_digit_boundm2_1[11:0] = 12'h38;//56
+		sqrt_digit_boundm2_2[11:0] = 12'ha7;//167
+		sqrt_digit_boundm2_3[11:0] = 12'h115;//277
+		sqrt_digit_boundm2_4[11:0] = 12'h182;//386
+		sqrt_digit_boundm2_5[11:0] = 12'h1ee;//494
+		sqrt_digit_boundm2_6[11:0] = 12'h259;//601
+		sqrt_digit_boundm2_7[11:0] = 12'h2c3;//707
+		sqrt_digit_boundm2_8[11:0] = 12'h32c;//812
+		sqrt_digit_boundm2_9[11:0] = 12'h394;//916
+	end
+	7'h00: // gaps between consecutive bounds below: 127, 126, ..., 120
+	begin
+		sqrt_digit_boundm2_1[11:0] = 12'h40;//64
+		sqrt_digit_boundm2_2[11:0] = 12'hbf;//191
+		sqrt_digit_boundm2_3[11:0] = 12'h13d;//317
+		sqrt_digit_boundm2_4[11:0] = 12'h1ba;//442
+		sqrt_digit_boundm2_5[11:0] = 12'h236;//566
+		sqrt_digit_boundm2_6[11:0] = 12'h2b1;//689
+		sqrt_digit_boundm2_7[11:0] = 12'h32b;//811
+		sqrt_digit_boundm2_8[11:0] = 12'h3a4;//932
+		sqrt_digit_boundm2_9[11:0] = 12'h41c;//1052
+	end
+
+  default: // undecoded selector: all nine outputs are driven to X
+  begin
+		sqrt_digit_boundm2_1[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundm2_2[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundm2_3[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundm2_4[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundm2_5[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundm2_6[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundm2_7[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundm2_8[11:0] = {12{1'bx}};// X (selector not decoded)
+		sqrt_digit_boundm2_9[11:0] = {12{1'bx}};// X (selector not decoded)
+	end
+endcase
+// &CombEnd; @1041
+end
+assign sqrt_digit_bound2_1[11:0] = sqrt_secd_round_sign ? sqrt_digit_boundm2_1[11:0] : sqrt_digit_boundp2_1[11:0]; // per-bound 2:1 mux: sqrt_secd_round_sign=1 -> "m2" table, 0 -> "p2" table
+assign sqrt_digit_bound2_2[11:0] = sqrt_secd_round_sign ? sqrt_digit_boundm2_2[11:0] : sqrt_digit_boundp2_2[11:0]; // same select for bound 2
+assign sqrt_digit_bound2_3[11:0] = sqrt_secd_round_sign ? sqrt_digit_boundm2_3[11:0] : sqrt_digit_boundp2_3[11:0]; // same select for bound 3
+assign sqrt_digit_bound2_4[11:0] = sqrt_secd_round_sign ? sqrt_digit_boundm2_4[11:0] : sqrt_digit_boundp2_4[11:0]; // same select for bound 4
+assign sqrt_digit_bound2_5[11:0] = sqrt_secd_round_sign ? sqrt_digit_boundm2_5[11:0] : sqrt_digit_boundp2_5[11:0]; // same select for bound 5
+assign sqrt_digit_bound2_6[11:0] = sqrt_secd_round_sign ? sqrt_digit_boundm2_6[11:0] : sqrt_digit_boundp2_6[11:0]; // same select for bound 6
+assign sqrt_digit_bound2_7[11:0] = sqrt_secd_round_sign ? sqrt_digit_boundm2_7[11:0] : sqrt_digit_boundp2_7[11:0]; // same select for bound 7
+assign sqrt_digit_bound2_8[11:0] = sqrt_secd_round_sign ? sqrt_digit_boundm2_8[11:0] : sqrt_digit_boundp2_8[11:0]; // same select for bound 8
+assign sqrt_digit_bound2_9[11:0] = sqrt_secd_round_sign ? sqrt_digit_boundm2_9[11:0] : sqrt_digit_boundp2_9[11:0]; // same select for bound 9; NOTE(review): "m2"/"p2" presumably mean minus/plus - confirm
+                                                
+
+assign digit_bound_1[11:0] = sqrt_first_round ? 12'h2 : // priority mux, sqrt_first_round wins: fixed bound 2 (original note "-2"; NOTE(review): reason for the minus sign is not visible here - confirm)
+                                                sqrt_secd_round ? sqrt_digit_bound2_1[11:0] // else if sqrt_secd_round: sign-selected second-round sqrt bound
+                                                                 : ori_digit_bound_1[11:0]; // otherwise: value from the ori_digit_bound table
+assign digit_bound_2[11:0] = sqrt_first_round ? 12'h10 : // sqrt_first_round: fixed bound 16 (original note "-16")
+                                                sqrt_secd_round ? sqrt_digit_bound2_2[11:0] // else if sqrt_secd_round: sign-selected second-round sqrt bound
+                                                                : ori_digit_bound_2[11:0]; // otherwise: value from the ori_digit_bound table
+assign digit_bound_3[11:0] = sqrt_first_round ? 12'h35 : // sqrt_first_round: fixed bound 53 (original note "-53")
+                                                sqrt_secd_round ? sqrt_digit_bound2_3[11:0] // else if sqrt_secd_round: sign-selected second-round sqrt bound
+                                                                : ori_digit_bound_3[11:0]; // otherwise: value from the ori_digit_bound table
+assign digit_bound_4[11:0] = sqrt_first_round ? 12'h5f : // sqrt_first_round: fixed bound 95 (original note "-95")
+                                                sqrt_secd_round ? sqrt_digit_bound2_4[11:0] // else if sqrt_secd_round: sign-selected second-round sqrt bound
+                                                                : ori_digit_bound_4[11:0]; // otherwise: value from the ori_digit_bound table
+assign digit_bound_5[11:0] = sqrt_first_round ? 12'ha0 : // sqrt_first_round: fixed bound 160 (original note "-160")
+                                                sqrt_secd_round ? sqrt_digit_bound2_5[11:0] // else if sqrt_secd_round: sign-selected second-round sqrt bound
+                                                                : ori_digit_bound_5[11:0]; // otherwise: value from the ori_digit_bound table
+assign digit_bound_6[11:0] = sqrt_first_round ? 12'hf0 : // sqrt_first_round: fixed bound 240 (original note "-240")
+                                                sqrt_secd_round ? sqrt_digit_bound2_6[11:0] // else if sqrt_secd_round: sign-selected second-round sqrt bound
+                                                                : ori_digit_bound_6[11:0];    // otherwise: value from the ori_digit_bound table
+assign digit_bound_7[11:0] = sqrt_first_round ? 12'h14f : // sqrt_first_round: fixed bound 335 (original note "-335")
+                                                sqrt_secd_round ? sqrt_digit_bound2_7[11:0] // else if sqrt_secd_round: sign-selected second-round sqrt bound
+                                                                : ori_digit_bound_7[11:0]; // otherwise: value from the ori_digit_bound table
+assign digit_bound_8[11:0] = sqrt_first_round ? 12'h1c2 : // sqrt_first_round: fixed bound 450 (original note "-450")
+                                                sqrt_secd_round ? sqrt_digit_bound2_8[11:0] // else if sqrt_secd_round: sign-selected second-round sqrt bound
+                                                                : ori_digit_bound_8[11:0]; // otherwise: value from the ori_digit_bound table
+assign digit_bound_9[11:0] = sqrt_first_round ? 12'h23a : // sqrt_first_round: fixed bound 570 (original note "-570")
+                                                sqrt_secd_round ? sqrt_digit_bound2_9[11:0] // else if sqrt_secd_round: sign-selected second-round sqrt bound
+                                                                : ori_digit_bound_9[11:0]; // otherwise: value from the ori_digit_bound table
+// &ModuleEnd; @1080
+endmodule
+
+
+
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt_radix16_with_sqrt.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt_radix16_with_sqrt.v
new file mode 100644
index 00000000..77a95ae9
--- /dev/null
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt_radix16_with_sqrt.v
@@ -0,0 +1,1152 @@
+/*Copyright 2019-2021 T-Head Semiconductor Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// &ModuleBeg; @22
+module ct_vfdsu_srt_radix16_with_sqrt(
+  cp0_vfpu_icg_en,
+  cp0_yy_clk_en,
+  cpurst_b,
+  forever_cpuclk,
+  initial_bound_sel_in,
+  initial_divisor_in,
+  initial_remainder_in,
+  initial_srt_en,
+  initial_srt_sel_div_in,
+  initial_srt_sel_sqrt_in,
+  pad_yy_icg_scan_en,
+  srt_first_round,
+  srt_remainder,
+  srt_remainder_out,
+  srt_remainder_sign,
+  srt_secd_round,
+  srt_sm_on,
+  total_qt_rt,
+  vdiv_qt_rt
+);
+
+// &Ports; @23
+input           cp0_vfpu_icg_en;           
+input           cp0_yy_clk_en;             
+input           cpurst_b;                  
+input           forever_cpuclk;            
+input   [6 :0]  initial_bound_sel_in;      
+input   [55:0]  initial_divisor_in;        
+input   [60:0]  initial_remainder_in;      
+input           initial_srt_en;            
+input           initial_srt_sel_div_in;    
+input           initial_srt_sel_sqrt_in;   
+input           pad_yy_icg_scan_en;        
+input           srt_first_round;           
+input           srt_secd_round;            
+input           srt_sm_on;                 
+output  [60:0]  srt_remainder;             
+output  [59:0]  srt_remainder_out;         
+output          srt_remainder_sign;        
+output  [57:0]  total_qt_rt;               
+output  [57:0]  vdiv_qt_rt;                
+
+// &Regs; @24
+reg     [6 :0]  bound_sel;                 
+reg     [60:0]  cur_rem;                   
+reg     [57:0]  qt_rt_const_shift_std;     
+reg     [55:0]  srt_divisor;               
+reg     [60:0]  srt_remainder;             
+reg     [60:0]  srt_remainder_minus;       
+reg     [60:0]  srt_remainder_minus_nxt;   
+reg             srt_sel_div;               
+reg             srt_sel_sqrt;              
+reg     [57:0]  total_qt_rt;               
+reg     [57:0]  total_qt_rt_minus;         
+reg     [57:0]  total_qt_rt_minus_next;    
+reg     [57:0]  total_qt_rt_next;          
+
+// &Wires; @25
+wire            bound1_cmp_sign;           
+wire            bound2_cmp_sign;           
+wire            bound3_cmp_sign;           
+wire            bound4_cmp_sign;           
+wire            bound5_cmp_sign;           
+wire            bound6_cmp_sign;           
+wire            bound7_cmp_sign;           
+wire            bound8_cmp_sign;           
+wire            bound9_cmp_sign;           
+wire    [8 :0]  bound_cmp_sign;            
+wire            cp0_vfpu_icg_en;           
+wire            cp0_yy_clk_en;             
+wire            cpurst_b;                  
+wire    [60:0]  cur_rem_1;                 
+wire    [60:0]  cur_rem_2;                 
+wire    [60:0]  cur_rem_3;                 
+wire    [60:0]  cur_rem_4;                 
+wire    [60:0]  cur_rem_5;                 
+wire    [60:0]  cur_rem_6;                 
+wire    [60:0]  cur_rem_7;                 
+wire    [60:0]  cur_rem_8;                 
+wire    [60:0]  cur_rem_9;                 
+wire    [11:0]  digit_bound_1;             
+wire    [11:0]  digit_bound_2;             
+wire    [11:0]  digit_bound_3;             
+wire    [11:0]  digit_bound_4;             
+wire    [11:0]  digit_bound_5;             
+wire    [11:0]  digit_bound_6;             
+wire    [11:0]  digit_bound_7;             
+wire    [11:0]  digit_bound_8;             
+wire    [11:0]  digit_bound_9;             
+wire    [60:0]  div_qt_1_rem_add_op1;      
+wire    [60:0]  div_qt_2_rem_add_op1;      
+wire    [60:0]  div_qt_3_rem_add_op1_0;    
+wire    [60:0]  div_qt_3_rem_add_op1_1;    
+wire    [60:0]  div_qt_4_rem_add_op1;      
+wire    [60:0]  div_qt_5_rem_add_op1_0;    
+wire    [60:0]  div_qt_5_rem_add_op1_1;    
+wire    [60:0]  div_qt_6_rem_add_op1_0;    
+wire    [60:0]  div_qt_6_rem_add_op1_1;    
+wire    [60:0]  div_qt_7_rem_add_op1_0;    
+wire    [60:0]  div_qt_7_rem_add_op1_1;    
+wire    [60:0]  div_qt_8_rem_add_op1;      
+wire    [60:0]  div_qt_9_rem_add_op1_0;    
+wire    [60:0]  div_qt_9_rem_add_op1_1;    
+wire    [60:0]  div_qt_r1_rem_add_op1;     
+wire    [60:0]  div_qt_r2_rem_add_op1;     
+wire    [60:0]  div_qt_r3_rem_add_op1_0;   
+wire    [60:0]  div_qt_r3_rem_add_op1_1;   
+wire    [60:0]  div_qt_r4_rem_add_op1;     
+wire    [60:0]  div_qt_r5_rem_add_op1_0;   
+wire    [60:0]  div_qt_r5_rem_add_op1_1;   
+wire    [60:0]  div_qt_r6_rem_add_op1_0;   
+wire    [60:0]  div_qt_r6_rem_add_op1_1;   
+wire    [60:0]  div_qt_r7_rem_add_op1_0;   
+wire    [60:0]  div_qt_r7_rem_add_op1_1;   
+wire    [60:0]  div_qt_r8_rem_add_op1;     
+wire    [60:0]  div_qt_r9_rem_add_op1_0;   
+wire    [60:0]  div_qt_r9_rem_add_op1_1;   
+wire    [60:0]  div_rem_add1_op1;          
+wire    [60:0]  div_rem_add2_op1;          
+wire    [60:0]  div_rem_add3_op1_0;        
+wire    [60:0]  div_rem_add3_op1_1;        
+wire    [60:0]  div_rem_add4_op1;          
+wire    [60:0]  div_rem_add5_op1_0;        
+wire    [60:0]  div_rem_add5_op1_1;        
+wire    [60:0]  div_rem_add6_op1_0;        
+wire    [60:0]  div_rem_add6_op1_1;        
+wire    [60:0]  div_rem_add7_op1_0;        
+wire    [60:0]  div_rem_add7_op1_1;        
+wire    [60:0]  div_rem_add8_op1;          
+wire    [60:0]  div_rem_add9_op1_0;        
+wire    [60:0]  div_rem_add9_op1_1;        
+wire            forever_cpuclk;            
+wire    [6 :0]  initial_bound_sel_in;      
+wire    [55:0]  initial_divisor_in;        
+wire    [60:0]  initial_remainder_in;      
+wire            initial_srt_en;            
+wire            initial_srt_sel_div_in;    
+wire            initial_srt_sel_sqrt_in;   
+wire            pad_yy_icg_scan_en;        
+wire    [11:0]  part_rem;                  
+wire    [62:0]  qt_rt_const_q1;            
+wire    [62:0]  qt_rt_const_q10;           
+wire    [62:0]  qt_rt_const_q11;           
+wire    [62:0]  qt_rt_const_q112;          
+wire    [62:0]  qt_rt_const_q12;           
+wire    [62:0]  qt_rt_const_q128;          
+wire    [62:0]  qt_rt_const_q13;           
+wire    [62:0]  qt_rt_const_q14;           
+wire    [62:0]  qt_rt_const_q15;           
+wire    [62:0]  qt_rt_const_q16;           
+wire    [62:0]  qt_rt_const_q17;           
+wire    [62:0]  qt_rt_const_q192;          
+wire    [62:0]  qt_rt_const_q2;            
+wire    [62:0]  qt_rt_const_q23;           
+wire    [62:0]  qt_rt_const_q24;           
+wire    [62:0]  qt_rt_const_q27;           
+wire    [62:0]  qt_rt_const_q3;            
+wire    [62:0]  qt_rt_const_q31;           
+wire    [62:0]  qt_rt_const_q32;           
+wire    [62:0]  qt_rt_const_q4;            
+wire    [62:0]  qt_rt_const_q44;           
+wire    [62:0]  qt_rt_const_q5;            
+wire    [62:0]  qt_rt_const_q56;           
+wire    [62:0]  qt_rt_const_q6;            
+wire    [62:0]  qt_rt_const_q60;           
+wire    [62:0]  qt_rt_const_q64;           
+wire    [62:0]  qt_rt_const_q7;            
+wire    [62:0]  qt_rt_const_q8;            
+wire    [62:0]  qt_rt_const_q80;           
+wire    [62:0]  qt_rt_const_q9;            
+wire    [57:0]  qt_rt_const_shift_std_next; 
+wire    [60:0]  rem_add1_op1;              
+wire    [60:0]  rem_add2_op1;              
+wire    [60:0]  rem_add3_op1_0;            
+wire    [60:0]  rem_add3_op1_1;            
+wire    [60:0]  rem_add4_op1;              
+wire    [60:0]  rem_add5_op1_0;            
+wire    [60:0]  rem_add5_op1_1;            
+wire    [60:0]  rem_add6_op1_0;            
+wire    [60:0]  rem_add6_op1_1;            
+wire    [60:0]  rem_add7_op1_0;            
+wire    [60:0]  rem_add7_op1_1;            
+wire    [60:0]  rem_add8_op1;              
+wire    [60:0]  rem_add9_op1_0;            
+wire    [60:0]  rem_add9_op1_1;            
+wire    [60:0]  rem_minus_minus_6;         
+wire            rem_sign;                  
+wire    [60:0]  remainder_minus_nor_nxt_0; 
+wire    [60:0]  remainder_minus_nor_nxt_1; 
+wire    [60:0]  remainder_minus_nor_nxt_2; 
+wire    [60:0]  remainder_minus_nor_nxt_3; 
+wire    [60:0]  remainder_minus_nor_nxt_4; 
+wire    [60:0]  remainder_minus_nor_nxt_5; 
+wire    [60:0]  remainder_minus_nor_nxt_6; 
+wire    [60:0]  remainder_minus_nor_nxt_7; 
+wire    [60:0]  remainder_minus_nor_nxt_8; 
+wire    [60:0]  remainder_minus_nor_nxt_9; 
+wire    [60:0]  remainder_minus_shift;     
+wire    [60:0]  remainder_shift;           
+wire            sqrt_first_round;          
+wire    [60:0]  sqrt_qt_1_rem_add_op1;     
+wire    [60:0]  sqrt_qt_2_rem_add_op1;     
+wire    [60:0]  sqrt_qt_3_rem_add_op1_0;   
+wire    [60:0]  sqrt_qt_3_rem_add_op1_1;   
+wire    [60:0]  sqrt_qt_4_rem_add_op1;     
+wire    [60:0]  sqrt_qt_5_rem_add_op1_0;   
+wire    [60:0]  sqrt_qt_5_rem_add_op1_1;   
+wire    [60:0]  sqrt_qt_6_rem_add_op1_0;   
+wire    [60:0]  sqrt_qt_6_rem_add_op1_1;   
+wire    [60:0]  sqrt_qt_7_rem_add_op1_0;   
+wire    [60:0]  sqrt_qt_7_rem_add_op1_1;   
+wire    [60:0]  sqrt_qt_8_rem_add_op1;     
+wire    [60:0]  sqrt_qt_9_rem_add_op1_0;   
+wire    [60:0]  sqrt_qt_9_rem_add_op1_1;   
+wire    [60:0]  sqrt_qt_r1_rem_add_op1;    
+wire    [60:0]  sqrt_qt_r2_rem_add_op1;    
+wire    [60:0]  sqrt_qt_r3_rem_add_op1_0;  
+wire    [60:0]  sqrt_qt_r3_rem_add_op1_1;  
+wire    [60:0]  sqrt_qt_r4_rem_add_op1;    
+wire    [60:0]  sqrt_qt_r5_rem_add_op1_0;  
+wire    [60:0]  sqrt_qt_r5_rem_add_op1_1;  
+wire    [60:0]  sqrt_qt_r6_rem_add_op1_0;  
+wire    [60:0]  sqrt_qt_r6_rem_add_op1_1;  
+wire    [60:0]  sqrt_qt_r7_rem_add_op1_0;  
+wire    [60:0]  sqrt_qt_r7_rem_add_op1_1;  
+wire    [60:0]  sqrt_qt_r8_rem_add_op1;    
+wire    [60:0]  sqrt_qt_r9_rem_add_op1_0;  
+wire    [60:0]  sqrt_qt_r9_rem_add_op1_1;  
+wire    [60:0]  sqrt_rem_add1_op1;         
+wire    [60:0]  sqrt_rem_add2_op1;         
+wire    [60:0]  sqrt_rem_add3_op1_0;       
+wire    [60:0]  sqrt_rem_add3_op1_1;       
+wire    [60:0]  sqrt_rem_add4_op1;         
+wire    [60:0]  sqrt_rem_add5_op1_0;       
+wire    [60:0]  sqrt_rem_add5_op1_1;       
+wire    [60:0]  sqrt_rem_add6_op1_0;       
+wire    [60:0]  sqrt_rem_add6_op1_1;       
+wire    [60:0]  sqrt_rem_add7_op1_0;       
+wire    [60:0]  sqrt_rem_add7_op1_1;       
+wire    [60:0]  sqrt_rem_add8_op1;         
+wire    [60:0]  sqrt_rem_add9_op1_0;       
+wire    [60:0]  sqrt_rem_add9_op1_1;       
+wire            sqrt_secd_round;           
+wire            sqrt_secd_round_sign;      
+wire            srt_div_clk;               
+wire            srt_div_clk_en;            
+wire            srt_first_round;           
+wire            srt_qt_rem_clk;            
+wire            srt_qt_rem_clk_en;         
+wire    [60:0]  srt_remainder_nxt;         
+wire    [59:0]  srt_remainder_out;         
+wire            srt_remainder_sign;        
+wire            srt_secd_round;            
+wire            srt_sm_on;                 
+wire    [57:0]  vdiv_qt_rt;                
+
+
+parameter  DATA_WIDTH = 56;
+parameter  REM_WIDTH  = 61;
+parameter  QT_WIDTH   = 58;
+//==========================================================
+//    SRT Remainder & Divisor for Quotient/Root Generate
+//==========================================================
+assign srt_qt_rem_clk_en = initial_srt_en || srt_sm_on; // clock is requested while loading (initial_srt_en) or iterating (srt_sm_on)
+
+// &Instance("gated_clk_cell","x_srt_qt_rem_clk"); @35
+gated_clk_cell  x_srt_qt_rem_clk ( // produces srt_qt_rem_clk, the clock of the remainder / bound_sel / quotient registers below
+  .clk_in             (forever_cpuclk    ),
+  .clk_out            (srt_qt_rem_clk    ),
+  .external_en        (1'b0              ), // tied off in this instance
+  .global_en          (cp0_yy_clk_en     ),
+  .local_en           (srt_qt_rem_clk_en ), // block-local request computed above
+  .module_en          (cp0_vfpu_icg_en   ),
+  .pad_yy_icg_scan_en (pad_yy_icg_scan_en)  // NOTE(review): presumably a scan-mode override; gated_clk_cell is defined elsewhere - confirm
+);
+
+// &Connect( .clk_in         (forever_cpuclk), @36
+//           .clk_out        (srt_qt_rem_clk), @37
+//           .external_en    (1'b0), @38
+//           .global_en      (cp0_yy_clk_en), @39
+//           .local_en       (srt_qt_rem_clk_en), @40
+//           .module_en      (cp0_vfpu_icg_en) @41
+//         ); @42
+
+assign srt_div_clk_en = initial_srt_en; // clock is requested only while a new operation is being loaded
+
+// &Instance("gated_clk_cell","x_srt_div_clk"); @46
+gated_clk_cell  x_srt_div_clk ( // produces srt_div_clk, the clock of srt_divisor / srt_sel_div / srt_sel_sqrt below
+  .clk_in             (forever_cpuclk    ),
+  .clk_out            (srt_div_clk       ),
+  .external_en        (1'b0              ), // tied off in this instance
+  .global_en          (cp0_yy_clk_en     ),
+  .local_en           (srt_div_clk_en    ), // block-local request computed above
+  .module_en          (cp0_vfpu_icg_en   ),
+  .pad_yy_icg_scan_en (pad_yy_icg_scan_en)  // NOTE(review): presumably a scan-mode override; gated_clk_cell is defined elsewhere - confirm
+);
+
+// &Connect( .clk_in         (forever_cpuclk), @47
+//           .clk_out        (srt_div_clk), @48
+//           .external_en    (1'b0), @49
+//           .global_en      (cp0_yy_clk_en), @50
+//           .local_en       (srt_div_clk_en), @51
+//           .module_en      (cp0_vfpu_icg_en) @52
+//         ); @53
+
+always @(posedge srt_qt_rem_clk or negedge cpurst_b) // remainder register; cpurst_b low clears it asynchronously
+begin
+  if(!cpurst_b)
+    srt_remainder[REM_WIDTH-1:0] <= {REM_WIDTH{1'b0}};
+  else if(initial_srt_en) // load has priority over iteration
+    srt_remainder[REM_WIDTH-1:0] <= initial_remainder_in[REM_WIDTH-1:0];
+  else if(srt_sm_on) // iterating: take the next remainder
+    srt_remainder[REM_WIDTH-1:0] <= srt_remainder_nxt[REM_WIDTH-1:0];
+  else // explicit hold
+    srt_remainder[REM_WIDTH-1:0] <= srt_remainder[REM_WIDTH-1:0];
+end
+// &Force("output","srt_remainder"); @66
+always @(posedge srt_div_clk or negedge cpurst_b) // divisor register; cpurst_b low clears it asynchronously
+begin
+  if(!cpurst_b)
+    srt_divisor[DATA_WIDTH-1:0] <= {DATA_WIDTH{1'b0}};
+  else if(initial_srt_en) // captured once per operation; there is no per-iteration update
+    srt_divisor[DATA_WIDTH-1:0] <= initial_divisor_in[DATA_WIDTH-1:0];
+  else // explicit hold
+    srt_divisor[DATA_WIDTH-1:0] <= srt_divisor[DATA_WIDTH-1:0];
+end
+
+always @(posedge srt_qt_rem_clk or negedge cpurst_b) // 7-bit index fed to the bound table instance; cleared asynchronously by cpurst_b low
+begin
+  if(!cpurst_b)
+    bound_sel[6:0] <= {7{1'b0}};
+  else if(initial_srt_en) // start of an operation: index supplied from outside
+    bound_sel[6:0] <= initial_bound_sel_in[6:0];
+  else if(srt_sm_on && srt_sel_sqrt) // only refreshed per iteration when srt_sel_sqrt is set
+    bound_sel[6:0] <= total_qt_rt_next[QT_WIDTH-2:QT_WIDTH-8]; // bits [56:50] with the default QT_WIDTH = 58
+  else // otherwise (including iterations with srt_sel_sqrt clear) the index is held
+    bound_sel[6:0] <= bound_sel[6:0];
+end
+
+always @(posedge srt_div_clk or negedge cpurst_b) // latched copy of initial_srt_sel_div_in; cleared asynchronously by cpurst_b low
+begin
+  if(!cpurst_b)
+    srt_sel_div <= 1'b0;
+  else if(initial_srt_en) // sampled when a new operation is loaded
+    srt_sel_div <= initial_srt_sel_div_in;
+  else // explicit hold
+    srt_sel_div <= srt_sel_div;
+end
+
+always @(posedge srt_div_clk or negedge cpurst_b) // latched copy of initial_srt_sel_sqrt_in; cleared asynchronously by cpurst_b low
+begin
+  if(!cpurst_b)
+    srt_sel_sqrt <= 1'b0;
+  else if(initial_srt_en) // sampled when a new operation is loaded
+    srt_sel_sqrt <= initial_srt_sel_sqrt_in;
+  else // explicit hold
+    srt_sel_sqrt <= srt_sel_sqrt;
+end
+
+always @(posedge srt_qt_rem_clk or negedge cpurst_b) // quotient/root accumulators and the shifting constant; cleared asynchronously by cpurst_b low
+begin
+  if(!cpurst_b)
+  begin
+    qt_rt_const_shift_std[QT_WIDTH-1:0] <= {QT_WIDTH{1'b0}};
+    total_qt_rt[QT_WIDTH-1:0]           <= {QT_WIDTH{1'b0}};
+    total_qt_rt_minus[QT_WIDTH-1:0]     <= {QT_WIDTH{1'b0}};
+  end
+  else if(initial_srt_en) // start of an operation
+  begin
+    qt_rt_const_shift_std[QT_WIDTH-1:0] <= {4'b0001,{(QT_WIDTH-4){1'b0}}}; // single 1 at bit QT_WIDTH-4
+    total_qt_rt[QT_WIDTH-1:0]           <= {QT_WIDTH{1'b0}};
+    total_qt_rt_minus[QT_WIDTH-1:0]     <= {QT_WIDTH{1'b0}};
+  end
+  else if(srt_sm_on) // iterating: take the *_next values
+  begin
+    qt_rt_const_shift_std[QT_WIDTH-1:0] <= qt_rt_const_shift_std_next[QT_WIDTH-1:0]; // current value shifted right by 4
+    total_qt_rt[QT_WIDTH-1:0]           <= total_qt_rt_next[QT_WIDTH-1:0];
+    total_qt_rt_minus[QT_WIDTH-1:0]     <= total_qt_rt_minus_next[QT_WIDTH-1:0];
+  end
+  else // explicit hold
+  begin
+    qt_rt_const_shift_std[QT_WIDTH-1:0] <= qt_rt_const_shift_std[QT_WIDTH-1:0];
+    total_qt_rt[QT_WIDTH-1:0]           <= total_qt_rt[QT_WIDTH-1:0];
+    total_qt_rt_minus[QT_WIDTH-1:0]     <= total_qt_rt_minus[QT_WIDTH-1:0];
+  end
+end
+
+// &Force("output","total_qt_rt"); @137
+// &Force("output","vdiv_qt_rt"); @138
+
+assign vdiv_qt_rt[QT_WIDTH-1:0] = srt_remainder[REM_WIDTH-1] // top remainder bit (same bit as rem_sign) picks the accumulator
+                                  ? total_qt_rt_minus[QT_WIDTH-1:0]
+                                  : total_qt_rt[QT_WIDTH-1:0];
+
+assign qt_rt_const_shift_std_next[QT_WIDTH-1:0] = {4'b0, qt_rt_const_shift_std[QT_WIDTH-1:4]}; // logical right shift by 4 bits per iteration
+
+//====================================================
+//  boundary  calculation
+//====================================================
+//assign bound_sel[6:0]   = srt_sel_div ? srt_divisor[DATA_WIDTH-1:DATA_WIDTH-7] 
+//                                      : total_qt_rt[QT_WIDTH-2:QT_WIDTH-8];
+
+// &Instance("ct_vfdsu_srt_radix16_bound_table"); @152
+ct_vfdsu_srt_radix16_bound_table  x_ct_vfdsu_srt_radix16_bound_table ( // supplies the nine 12-bit digit bounds compared against part_rem below
+  .bound_sel            (bound_sel           ), // registered 7-bit table index
+  .digit_bound_1        (digit_bound_1       ),
+  .digit_bound_2        (digit_bound_2       ),
+  .digit_bound_3        (digit_bound_3       ),
+  .digit_bound_4        (digit_bound_4       ),
+  .digit_bound_5        (digit_bound_5       ),
+  .digit_bound_6        (digit_bound_6       ),
+  .digit_bound_7        (digit_bound_7       ),
+  .digit_bound_8        (digit_bound_8       ),
+  .digit_bound_9        (digit_bound_9       ),
+  .sqrt_first_round     (sqrt_first_round    ), // srt_sel_sqrt && srt_first_round
+  .sqrt_secd_round      (sqrt_secd_round     ), // srt_sel_sqrt && srt_secd_round
+  .sqrt_secd_round_sign (sqrt_secd_round_sign)  // driven by rem_sign
+);
+
+assign sqrt_first_round = srt_sel_sqrt && srt_first_round; // round flags are qualified by the latched srt_sel_sqrt
+assign sqrt_secd_round  = srt_sel_sqrt && srt_secd_round;
+assign sqrt_secd_round_sign = rem_sign;
+assign rem_sign                = srt_remainder[REM_WIDTH-1]; // top bit of the remainder register
+assign part_rem[11:0]          = rem_sign // 12-bit remainder slice, bitwise inverted when rem_sign is 1
+                                 ? ~srt_remainder[REM_WIDTH-5:REM_WIDTH-16]
+                                 : srt_remainder[REM_WIDTH-5:REM_WIDTH-16]; // bits [56:45] with the default REM_WIDTH = 61
+// &Force("nonport","bound1_cmp_result");  @160
+// &Force("nonport","bound2_cmp_result");                                @161
+// &Force("nonport","bound3_cmp_result");                                @162
+// &Force("nonport","bound4_cmp_result");                                @163
+// &Force("nonport","bound5_cmp_result");                                @164
+// &Force("nonport","bound6_cmp_result");                                @165
+// &Force("nonport","bound7_cmp_result");                                @166
+// &Force("nonport","bound8_cmp_result");                                @167
+// &Force("nonport","bound9_cmp_result");                                @168
+// &Force("bus","bound1_cmp_result",11,0); @169
+// &Force("bus","bound2_cmp_result",11,0); @170
+// &Force("bus","bound3_cmp_result",11,0); @171
+// &Force("bus","bound4_cmp_result",11,0); @172
+// &Force("bus","bound5_cmp_result",11,0); @173
+// &Force("bus","bound6_cmp_result",11,0); @174
+// &Force("bus","bound7_cmp_result",11,0); @175
+// &Force("bus","bound8_cmp_result",11,0); @176
+// &Force("bus","bound9_cmp_result",11,0); @177
+// &Force("nonport","digit_bound_1"); @178
+// &Force("nonport","digit_bound_2"); @179
+// &Force("nonport","digit_bound_3"); @180
+// &Force("nonport","digit_bound_4"); @181
+// &Force("nonport","digit_bound_5"); @182
+// &Force("nonport","digit_bound_6"); @183
+// &Force("nonport","digit_bound_7"); @184
+// &Force("nonport","digit_bound_8"); @185
+// &Force("nonport","digit_bound_9"); @186
+// &Force("nonport","part_rem"); @187
+////csky vperl_off                                 
+//assign bound1_cmp_result[11:0] = $unsigned($signed(part_rem[11:0]) 
+//                                         + $signed(digit_bound_1[11:0]));
+//assign bound2_cmp_result[11:0] = $unsigned($signed(part_rem[11:0])
+//                                         + $signed(digit_bound_2[11:0]));
+//assign bound3_cmp_result[11:0] = $unsigned($signed(part_rem[11:0]) 
+//                                         + $signed(digit_bound_3[11:0]));
+//assign bound4_cmp_result[11:0] = $unsigned($signed(part_rem[11:0])
+//                                         + $signed(digit_bound_4[11:0]));
+//assign bound5_cmp_result[11:0] = $unsigned($signed(part_rem[11:0])
+//                                         + $signed(digit_bound_5[11:0]));
+//assign bound6_cmp_result[11:0] = $unsigned($signed(part_rem[11:0])
+//                                         + $signed(digit_bound_6[11:0]));
+//assign bound7_cmp_result[11:0] = $unsigned($signed(part_rem[11:0])
+//                                         + $signed(digit_bound_7[11:0]));
+//assign bound8_cmp_result[11:0] = $unsigned($signed(part_rem[11:0])
+//                                         + $signed(digit_bound_8[11:0]));
+//assign bound9_cmp_result[11:0] = $unsigned($signed(part_rem[11:0])
+//                                         + $signed(digit_bound_9[11:0]));
+////csky vperl_on
+//assign bound1_cmp_sign         = bound1_cmp_result[11];
+//assign bound2_cmp_sign         = bound2_cmp_result[11];
+//assign bound3_cmp_sign         = bound3_cmp_result[11];
+//assign bound4_cmp_sign         = bound4_cmp_result[11];
+//assign bound5_cmp_sign         = bound5_cmp_result[11];
+//assign bound6_cmp_sign         = bound6_cmp_result[11];
+//assign bound7_cmp_sign         = bound7_cmp_result[11];
+//assign bound8_cmp_sign         = bound8_cmp_result[11];
+//assign bound9_cmp_sign         = bound9_cmp_result[11];
+assign bound1_cmp_sign         = part_rem[11:0] < digit_bound_1[11:0]; // 1 when part_rem is below the bound (unsigned 12-bit compare)
+assign bound2_cmp_sign         = part_rem[11:0] < digit_bound_2[11:0]; // replaces the commented-out add-and-take-sign form above
+assign bound3_cmp_sign         = part_rem[11:0] < digit_bound_3[11:0];
+assign bound4_cmp_sign         = part_rem[11:0] < digit_bound_4[11:0];
+assign bound5_cmp_sign         = part_rem[11:0] < digit_bound_5[11:0];
+assign bound6_cmp_sign         = part_rem[11:0] < digit_bound_6[11:0];
+assign bound7_cmp_sign         = part_rem[11:0] < digit_bound_7[11:0];
+assign bound8_cmp_sign         = part_rem[11:0] < digit_bound_8[11:0];
+assign bound9_cmp_sign         = part_rem[11:0] < digit_bound_9[11:0];
+//====================================================
+//  remainder calculation
+//====================================================
+// the root preparation: 63-bit copies of qt_rt_const_shift_std, left-shifted by 0..7 bit positions
+assign qt_rt_const_q1[REM_WIDTH+1:0]   = {5'b0,qt_rt_const_shift_std[QT_WIDTH-1:0]}; // shift 0
+assign qt_rt_const_q2[REM_WIDTH+1:0]   = {4'b0,qt_rt_const_shift_std[QT_WIDTH-1:0],1'b0}; // shift 1
+assign qt_rt_const_q4[REM_WIDTH+1:0]   = {3'b0,qt_rt_const_shift_std[QT_WIDTH-1:0],2'b0}; // shift 2
+assign qt_rt_const_q8[REM_WIDTH+1:0]   = {2'b0,qt_rt_const_shift_std[QT_WIDTH-1:0],3'b0}; // shift 3
+assign qt_rt_const_q16[REM_WIDTH+1:0]  = {1'b0,qt_rt_const_shift_std[QT_WIDTH-1:0],4'b0}; // shift 4
+assign qt_rt_const_q32[REM_WIDTH+1:0]  =      {qt_rt_const_shift_std[QT_WIDTH-1:0],5'b0}; // shift 5, fills all 63 bits
+assign qt_rt_const_q64[REM_WIDTH+1:0]  =      {qt_rt_const_shift_std[QT_WIDTH-2:0],6'b0}; // shift 6, top source bit dropped
+assign qt_rt_const_q128[REM_WIDTH+1:0] =      {qt_rt_const_shift_std[QT_WIDTH-3:0],7'b0}; // shift 7, top two source bits dropped
+assign qt_rt_const_q3[REM_WIDTH+1:0]          =  qt_rt_const_q1[REM_WIDTH+1:0]   // each qN below ORs the power-of-two copies whose weights add up to N
+                                                |qt_rt_const_q2[REM_WIDTH+1:0]; // NOTE(review): OR equals a sum only while qt_rt_const_shift_std has a single bit set (it is loaded with one 1 and only shifted right) - presumably intended, confirm
+assign qt_rt_const_q5[REM_WIDTH+1:0]          =  qt_rt_const_q1[REM_WIDTH+1:0]   
+                                                |qt_rt_const_q4[REM_WIDTH+1:0];
+assign qt_rt_const_q6[REM_WIDTH+1:0]          =  qt_rt_const_q2[REM_WIDTH+1:0]   
+                                                |qt_rt_const_q4[REM_WIDTH+1:0];
+assign qt_rt_const_q7[REM_WIDTH+1:0]          =  qt_rt_const_q1[REM_WIDTH+1:0]
+                                                |qt_rt_const_q2[REM_WIDTH+1:0]   
+                                                |qt_rt_const_q4[REM_WIDTH+1:0];
+assign qt_rt_const_q9[REM_WIDTH+1:0]          =  qt_rt_const_q1[REM_WIDTH+1:0]
+                                                |qt_rt_const_q8[REM_WIDTH+1:0];
+assign qt_rt_const_q10[REM_WIDTH+1:0]         =  qt_rt_const_q2[REM_WIDTH+1:0]   
+                                                |qt_rt_const_q8[REM_WIDTH+1:0];
+assign qt_rt_const_q11[REM_WIDTH+1:0]         =  qt_rt_const_q1[REM_WIDTH+1:0]
+                                                |qt_rt_const_q2[REM_WIDTH+1:0]   
+                                                |qt_rt_const_q8[REM_WIDTH+1:0];
+assign qt_rt_const_q12[REM_WIDTH+1:0]         =  qt_rt_const_q4[REM_WIDTH+1:0]
+                                                |qt_rt_const_q8[REM_WIDTH+1:0];              
+assign qt_rt_const_q13[REM_WIDTH+1:0]         =  qt_rt_const_q1[REM_WIDTH+1:0]
+                                                |qt_rt_const_q4[REM_WIDTH+1:0]
+                                                |qt_rt_const_q8[REM_WIDTH+1:0]; 
+assign qt_rt_const_q14[REM_WIDTH+1:0]         =  qt_rt_const_q2[REM_WIDTH+1:0]
+                                                |qt_rt_const_q4[REM_WIDTH+1:0]
+                                                |qt_rt_const_q8[REM_WIDTH+1:0];
+assign qt_rt_const_q15[REM_WIDTH+1:0]         =  qt_rt_const_q1[REM_WIDTH+1:0]
+                                                |qt_rt_const_q2[REM_WIDTH+1:0]   
+                                                |qt_rt_const_q4[REM_WIDTH+1:0]   
+                                                |qt_rt_const_q8[REM_WIDTH+1:0]; 
+assign qt_rt_const_q17[REM_WIDTH+1:0]         =  qt_rt_const_q1[REM_WIDTH+1:0]
+                                               |qt_rt_const_q16[REM_WIDTH+1:0];
+assign qt_rt_const_q23[REM_WIDTH+1:0]         =  qt_rt_const_q1[REM_WIDTH+1:0]   
+                                                |qt_rt_const_q2[REM_WIDTH+1:0]   
+                                                |qt_rt_const_q4[REM_WIDTH+1:0] 
+                                               |qt_rt_const_q16[REM_WIDTH+1:0];
+assign qt_rt_const_q24[REM_WIDTH+1:0]         =  qt_rt_const_q8[REM_WIDTH+1:0]   
+                                               |qt_rt_const_q16[REM_WIDTH+1:0];
+assign qt_rt_const_q27[REM_WIDTH+1:0]         =  qt_rt_const_q1[REM_WIDTH+1:0]   
+                                                |qt_rt_const_q2[REM_WIDTH+1:0]   
+                                                |qt_rt_const_q8[REM_WIDTH+1:0]   
+                                               |qt_rt_const_q16[REM_WIDTH+1:0];  
+assign qt_rt_const_q31[REM_WIDTH+1:0]         =  qt_rt_const_q1[REM_WIDTH+1:0]
+                                                |qt_rt_const_q2[REM_WIDTH+1:0]   
+                                                |qt_rt_const_q4[REM_WIDTH+1:0]   
+                                                |qt_rt_const_q8[REM_WIDTH+1:0]   
+                                               |qt_rt_const_q16[REM_WIDTH+1:0];
+assign qt_rt_const_q44[REM_WIDTH+1:0]         =  qt_rt_const_q4[REM_WIDTH+1:0]
+                                                |qt_rt_const_q8[REM_WIDTH+1:0]   
+                                               |qt_rt_const_q32[REM_WIDTH+1:0]; 
+assign qt_rt_const_q56[REM_WIDTH+1:0]         =  qt_rt_const_q8[REM_WIDTH+1:0]
+                                               |qt_rt_const_q16[REM_WIDTH+1:0]   
+                                               |qt_rt_const_q32[REM_WIDTH+1:0]; 
+assign qt_rt_const_q60[REM_WIDTH+1:0]         =  qt_rt_const_q4[REM_WIDTH+1:0]   
+                                                |qt_rt_const_q8[REM_WIDTH+1:0]   
+                                               |qt_rt_const_q16[REM_WIDTH+1:0]   
+                                               |qt_rt_const_q32[REM_WIDTH+1:0]; 
+assign  qt_rt_const_q80[REM_WIDTH+1:0]       =  qt_rt_const_q16[REM_WIDTH+1:0]
+                                               |qt_rt_const_q64[REM_WIDTH+1:0]; 
+assign qt_rt_const_q112[REM_WIDTH+1:0]       =  qt_rt_const_q16[REM_WIDTH+1:0]   
+                                               |qt_rt_const_q32[REM_WIDTH+1:0]
+                                               |qt_rt_const_q64[REM_WIDTH+1:0]; 
+assign qt_rt_const_q192[REM_WIDTH+1:0]       =  qt_rt_const_q64[REM_WIDTH+1:0]
+                                              |qt_rt_const_q128[REM_WIDTH+1:0];
+//=====================================
+// the sqrt current remainder operand b
+//=====================================
+// the root is negative: these *_r* operands use total_qt_rt_minus and are the ones picked by the sqrt_rem_add* muxes when rem_sign is 1
+// -1
+assign sqrt_qt_r1_rem_add_op1[REM_WIDTH-1:0]  = {4'b0,total_qt_rt_minus[QT_WIDTH-1:1]}
+                                                |qt_rt_const_q31[REM_WIDTH+1:2];
+//-2
+assign sqrt_qt_r2_rem_add_op1[REM_WIDTH-1:0]  =  {3'b0,total_qt_rt_minus[QT_WIDTH-1:0]}
+                                                |qt_rt_const_q60[REM_WIDTH+1:2];
+//-4
+assign sqrt_qt_r4_rem_add_op1[REM_WIDTH-1:0]   = {2'b0,total_qt_rt_minus[QT_WIDTH-1:0],1'b0}
+                                                |qt_rt_const_q112[REM_WIDTH+1:2]; 
+//-8
+assign sqrt_qt_r8_rem_add_op1[REM_WIDTH-1:0]   = {1'b0,total_qt_rt_minus[QT_WIDTH-1:0],2'b0}
+                                                |qt_rt_const_q192[REM_WIDTH+1:2]; 
+//-3: two operands; op1_0 reuses the -2 operand
+assign sqrt_qt_r3_rem_add_op1_0[REM_WIDTH-1:0] = sqrt_qt_r2_rem_add_op1[REM_WIDTH-1:0];
+assign sqrt_qt_r3_rem_add_op1_1[REM_WIDTH-1:0] = {4'b0,total_qt_rt_minus[QT_WIDTH-1:1]}
+                                                |qt_rt_const_q27[REM_WIDTH+1:2];
+//-5 112+23q-i-1 (original note, kept verbatim); op1_0 reuses the -4 operand
+assign sqrt_qt_r5_rem_add_op1_0[REM_WIDTH-1:0] = sqrt_qt_r4_rem_add_op1[REM_WIDTH-1:0];
+assign sqrt_qt_r5_rem_add_op1_1[REM_WIDTH-1:0] = {4'b0,total_qt_rt_minus[QT_WIDTH-1:1]}
+                                                |qt_rt_const_q23[REM_WIDTH+1:2];
+//-6: op1_0 reuses the -4 operand
+assign sqrt_qt_r6_rem_add_op1_0[REM_WIDTH-1:0] = sqrt_qt_r4_rem_add_op1[REM_WIDTH-1:0];
+assign sqrt_qt_r6_rem_add_op1_1[REM_WIDTH-1:0] = {3'b0,total_qt_rt_minus[QT_WIDTH-1:0]}
+                                                |qt_rt_const_q44[REM_WIDTH+1:2];
+//-7: op1_0 reuses the -8 operand; op1_1 is the only bit-inverted operand in this group
+assign sqrt_qt_r7_rem_add_op1_0[REM_WIDTH-1:0] = sqrt_qt_r8_rem_add_op1[REM_WIDTH-1:0];
+assign sqrt_qt_r7_rem_add_op1_1[REM_WIDTH-1:0] = ~({4'b0,total_qt_rt_minus[QT_WIDTH-1:1]}
+                                                   |qt_rt_const_q17[REM_WIDTH+1:2]);
+//-9: op1_0 reuses the -8 operand
+assign sqrt_qt_r9_rem_add_op1_0[REM_WIDTH-1:0] = sqrt_qt_r8_rem_add_op1[REM_WIDTH-1:0];
+assign sqrt_qt_r9_rem_add_op1_1[REM_WIDTH-1:0] = {4'b0,total_qt_rt_minus[QT_WIDTH-1:1]}
+                                                | qt_rt_const_q15[REM_WIDTH+1:2];
+// the root is positive: these operands use total_qt_rt, are bit-inverted, and are the ones picked by the sqrt_rem_add* muxes when rem_sign is 0
+// 1
+assign sqrt_qt_1_rem_add_op1[REM_WIDTH-1:0]    =  ~({4'b0,total_qt_rt[QT_WIDTH-1:1]} 
+                                                  | qt_rt_const_q1[REM_WIDTH+1:2]);           
+// 2
+assign sqrt_qt_2_rem_add_op1[REM_WIDTH-1:0]    = ~({3'b0,total_qt_rt[QT_WIDTH-1:0]} 
+                                                  | qt_rt_const_q4[REM_WIDTH+1:2]);
+// 4
+assign sqrt_qt_4_rem_add_op1[REM_WIDTH-1:0]    = ~({2'b0,total_qt_rt[QT_WIDTH-1:0],1'b0} 
+                                                  | qt_rt_const_q16[REM_WIDTH+1:2]);
+// 8
+assign sqrt_qt_8_rem_add_op1[REM_WIDTH-1:0]    = ~({1'b0,total_qt_rt[QT_WIDTH-1:0],2'b0} 
+                                                  | qt_rt_const_q64[REM_WIDTH+1:2]);  
+// 3: two operands; op1_1 reuses the 1 operand
+assign sqrt_qt_3_rem_add_op1_0[REM_WIDTH-1:0]  = ~({3'b0,total_qt_rt[QT_WIDTH-1:0]}
+                                                  |qt_rt_const_q8[REM_WIDTH+1:2]);
+assign sqrt_qt_3_rem_add_op1_1[REM_WIDTH-1:0]  = sqrt_qt_1_rem_add_op1[REM_WIDTH-1:0];
+//5: op1_1 reuses the 1 operand
+assign sqrt_qt_5_rem_add_op1_0[REM_WIDTH-1:0]  = ~({2'b0,total_qt_rt[QT_WIDTH-1:0],1'b0} 
+                                                  | qt_rt_const_q24[REM_WIDTH+1:2]);
+assign sqrt_qt_5_rem_add_op1_1[REM_WIDTH-1:0]  = sqrt_qt_1_rem_add_op1[REM_WIDTH-1:0];
+//6: op1_1 reuses the 2 operand
+assign sqrt_qt_6_rem_add_op1_0[REM_WIDTH-1:0]  = ~({2'b0,total_qt_rt[QT_WIDTH-1:0],1'b0} 
+                                                  | qt_rt_const_q32[REM_WIDTH+1:2]);
+assign sqrt_qt_6_rem_add_op1_1[REM_WIDTH-1:0]  = sqrt_qt_2_rem_add_op1[REM_WIDTH-1:0];
+//7: op1_1 is the only operand in this group that is not bit-inverted
+assign sqrt_qt_7_rem_add_op1_0[REM_WIDTH-1:0]  = ~({1'b0,total_qt_rt[QT_WIDTH-1:0],2'b0} 
+                                                  | qt_rt_const_q56[REM_WIDTH+1:2]);
+assign sqrt_qt_7_rem_add_op1_1[REM_WIDTH-1:0]  = {4'b0,total_qt_rt[QT_WIDTH-1:1]} 
+                                                  | qt_rt_const_q7[REM_WIDTH+1:2];
+//9: op1_1 reuses the 1 operand
+assign sqrt_qt_9_rem_add_op1_0[REM_WIDTH-1:0]  = ~({1'b0,total_qt_rt[QT_WIDTH-1:0],2'b0} 
+                                                  | qt_rt_const_q80[REM_WIDTH+1:2]);
+assign sqrt_qt_9_rem_add_op1_1[REM_WIDTH-1:0]  = sqrt_qt_1_rem_add_op1[REM_WIDTH-1:0];
+
+assign sqrt_rem_add1_op1[REM_WIDTH-1:0]        = rem_sign // sqrt addend per digit: rem_sign=1 picks the sqrt_qt_rN_* form, rem_sign=0 the sqrt_qt_N_* form
+                                                 ? sqrt_qt_r1_rem_add_op1[REM_WIDTH-1:0]
+                                                 : sqrt_qt_1_rem_add_op1[REM_WIDTH-1:0];
+assign sqrt_rem_add2_op1[REM_WIDTH-1:0]        = rem_sign
+                                                 ? sqrt_qt_r2_rem_add_op1[REM_WIDTH-1:0]
+                                                 : sqrt_qt_2_rem_add_op1[REM_WIDTH-1:0];
+assign sqrt_rem_add4_op1[REM_WIDTH-1:0]        = rem_sign 
+                                                 ? sqrt_qt_r4_rem_add_op1[REM_WIDTH-1:0]
+                                                 : sqrt_qt_4_rem_add_op1[REM_WIDTH-1:0];
+assign sqrt_rem_add8_op1[REM_WIDTH-1:0]        = rem_sign
+                                                 ? sqrt_qt_r8_rem_add_op1[REM_WIDTH-1:0]
+                                                 : sqrt_qt_8_rem_add_op1[REM_WIDTH-1:0];     
+assign sqrt_rem_add3_op1_0[REM_WIDTH-1:0]      = rem_sign // digits 3,5,6,7,9 carry two addends (_0 and _1), each muxed the same way
+                                                ? sqrt_qt_r3_rem_add_op1_0[REM_WIDTH-1:0]
+                                                : sqrt_qt_3_rem_add_op1_0[REM_WIDTH-1:0];
+assign sqrt_rem_add3_op1_1[REM_WIDTH-1:0]      = rem_sign 
+                                                ? sqrt_qt_r3_rem_add_op1_1[REM_WIDTH-1:0]
+                                                : sqrt_qt_3_rem_add_op1_1[REM_WIDTH-1:0];
+assign sqrt_rem_add5_op1_0[REM_WIDTH-1:0]      = rem_sign 
+                                                ? sqrt_qt_r5_rem_add_op1_0[REM_WIDTH-1:0]
+                                                : sqrt_qt_5_rem_add_op1_0[REM_WIDTH-1:0];
+assign sqrt_rem_add5_op1_1[REM_WIDTH-1:0]      = rem_sign 
+                                                ? sqrt_qt_r5_rem_add_op1_1[REM_WIDTH-1:0]
+                                                : sqrt_qt_5_rem_add_op1_1[REM_WIDTH-1:0];
+assign sqrt_rem_add6_op1_0[REM_WIDTH-1:0]      = rem_sign 
+                                                ? sqrt_qt_r6_rem_add_op1_0[REM_WIDTH-1:0]
+                                                : sqrt_qt_6_rem_add_op1_0[REM_WIDTH-1:0];
+assign sqrt_rem_add6_op1_1[REM_WIDTH-1:0]      = rem_sign 
+                                                ? sqrt_qt_r6_rem_add_op1_1[REM_WIDTH-1:0]
+                                                : sqrt_qt_6_rem_add_op1_1[REM_WIDTH-1:0];
+assign sqrt_rem_add7_op1_0[REM_WIDTH-1:0]      = rem_sign 
+                                                ? sqrt_qt_r7_rem_add_op1_0[REM_WIDTH-1:0]
+                                                : sqrt_qt_7_rem_add_op1_0[REM_WIDTH-1:0];
+assign sqrt_rem_add7_op1_1[REM_WIDTH-1:0]      = rem_sign 
+                                                ? sqrt_qt_r7_rem_add_op1_1[REM_WIDTH-1:0]
+                                                : sqrt_qt_7_rem_add_op1_1[REM_WIDTH-1:0];
+assign sqrt_rem_add9_op1_0[REM_WIDTH-1:0]      = rem_sign 
+                                                ? sqrt_qt_r9_rem_add_op1_0[REM_WIDTH-1:0]
+                                                : sqrt_qt_9_rem_add_op1_0[REM_WIDTH-1:0];
+assign sqrt_rem_add9_op1_1[REM_WIDTH-1:0]      = rem_sign 
+                                                ? sqrt_qt_r9_rem_add_op1_1[REM_WIDTH-1:0]
+                                                : sqrt_qt_9_rem_add_op1_1[REM_WIDTH-1:0];
+
+//=====================================
+// the div current remainder operand b: multiples of srt_divisor that are added to the shifted remainder
+//=====================================
+//negative (selected when rem_sign=1): zero-extended divisor shifted left by 0/1/2/3 bits, i.e. 1x, 2x, 4x, 8x
+assign div_qt_r1_rem_add_op1[REM_WIDTH-1:0]    = {5'b0,srt_divisor[DATA_WIDTH-1:0]};
+assign div_qt_r2_rem_add_op1[REM_WIDTH-1:0]    = {4'b0,srt_divisor[DATA_WIDTH-1:0],1'b0};
+assign div_qt_r4_rem_add_op1[REM_WIDTH-1:0]    = {3'b0,srt_divisor[DATA_WIDTH-1:0],2'b0};
+assign div_qt_r8_rem_add_op1[REM_WIDTH-1:0]    = {2'b0,srt_divisor[DATA_WIDTH-1:0],3'b0};
+assign div_qt_r3_rem_add_op1_0[REM_WIDTH-1:0]  = div_qt_r2_rem_add_op1[REM_WIDTH-1:0]; // 3 = 2 + 1
+assign div_qt_r3_rem_add_op1_1[REM_WIDTH-1:0]  = div_qt_r1_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_r5_rem_add_op1_0[REM_WIDTH-1:0]  = div_qt_r4_rem_add_op1[REM_WIDTH-1:0]; // 5 = 4 + 1
+assign div_qt_r5_rem_add_op1_1[REM_WIDTH-1:0]  = div_qt_r1_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_r6_rem_add_op1_0[REM_WIDTH-1:0]  = div_qt_r4_rem_add_op1[REM_WIDTH-1:0]; // 6 = 4 + 2
+assign div_qt_r6_rem_add_op1_1[REM_WIDTH-1:0]  = div_qt_r2_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_r7_rem_add_op1_0[REM_WIDTH-1:0]  = div_qt_r8_rem_add_op1[REM_WIDTH-1:0]; // 7 = 8 - 1: second addend is the inverted 1x value
+assign div_qt_r7_rem_add_op1_1[REM_WIDTH-1:0]  =~div_qt_r1_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_r9_rem_add_op1_0[REM_WIDTH-1:0]  = div_qt_r8_rem_add_op1[REM_WIDTH-1:0]; // 9 = 8 + 1
+assign div_qt_r9_rem_add_op1_1[REM_WIDTH-1:0]  = div_qt_r1_rem_add_op1[REM_WIDTH-1:0];
+//positive (selected when rem_sign=0): bitwise inverse of the negative addends; the cur_rem_* adders add ~rem_sign as the matching carry-in
+assign div_qt_1_rem_add_op1[REM_WIDTH-1:0]     =~div_qt_r1_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_2_rem_add_op1[REM_WIDTH-1:0]     =~div_qt_r2_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_4_rem_add_op1[REM_WIDTH-1:0]     =~div_qt_r4_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_8_rem_add_op1[REM_WIDTH-1:0]     =~div_qt_r8_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_3_rem_add_op1_0[REM_WIDTH-1:0]   = div_qt_2_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_3_rem_add_op1_1[REM_WIDTH-1:0]   = div_qt_1_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_5_rem_add_op1_0[REM_WIDTH-1:0]   = div_qt_4_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_5_rem_add_op1_1[REM_WIDTH-1:0]   = div_qt_1_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_6_rem_add_op1_0[REM_WIDTH-1:0]   = div_qt_4_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_6_rem_add_op1_1[REM_WIDTH-1:0]   = div_qt_2_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_7_rem_add_op1_0[REM_WIDTH-1:0]   = div_qt_8_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_7_rem_add_op1_1[REM_WIDTH-1:0]   = ~div_qt_1_rem_add_op1[REM_WIDTH-1:0]; // double inversion: equals the plain 1x divisor value
+assign div_qt_9_rem_add_op1_0[REM_WIDTH-1:0]   = div_qt_8_rem_add_op1[REM_WIDTH-1:0];
+assign div_qt_9_rem_add_op1_1[REM_WIDTH-1:0]   = div_qt_1_rem_add_op1[REM_WIDTH-1:0];
+assign div_rem_add1_op1[REM_WIDTH-1:0]         = rem_sign ? div_qt_r1_rem_add_op1[REM_WIDTH-1:0] // division addend per digit: rem_sign=1 picks the
+                                                          : div_qt_1_rem_add_op1[REM_WIDTH-1:0]; // div_qt_rN_* form, rem_sign=0 the div_qt_N_* form
+assign div_rem_add2_op1[REM_WIDTH-1:0]         = rem_sign ? div_qt_r2_rem_add_op1[REM_WIDTH-1:0]
+                                                          : div_qt_2_rem_add_op1[REM_WIDTH-1:0];
+assign div_rem_add4_op1[REM_WIDTH-1:0]         = rem_sign ? div_qt_r4_rem_add_op1[REM_WIDTH-1:0]
+                                                          : div_qt_4_rem_add_op1[REM_WIDTH-1:0];
+assign div_rem_add8_op1[REM_WIDTH-1:0]         = rem_sign ? div_qt_r8_rem_add_op1[REM_WIDTH-1:0]
+                                                          : div_qt_8_rem_add_op1[REM_WIDTH-1:0];
+assign div_rem_add3_op1_0[REM_WIDTH-1:0]       = rem_sign // digits 3,5,6,7,9 carry two addends (_0 and _1), each muxed the same way
+                                                 ? div_qt_r3_rem_add_op1_0[REM_WIDTH-1:0]
+                                                 : div_qt_3_rem_add_op1_0[REM_WIDTH-1:0];
+assign div_rem_add3_op1_1[REM_WIDTH-1:0]       = rem_sign 
+                                                 ? div_qt_r3_rem_add_op1_1[REM_WIDTH-1:0]
+                                                 : div_qt_3_rem_add_op1_1[REM_WIDTH-1:0];
+assign div_rem_add5_op1_0[REM_WIDTH-1:0]       = rem_sign 
+                                                 ? div_qt_r5_rem_add_op1_0[REM_WIDTH-1:0]
+                                                 : div_qt_5_rem_add_op1_0[REM_WIDTH-1:0];
+assign div_rem_add5_op1_1[REM_WIDTH-1:0]       = rem_sign 
+                                                 ? div_qt_r5_rem_add_op1_1[REM_WIDTH-1:0]
+                                                 : div_qt_5_rem_add_op1_1[REM_WIDTH-1:0];
+assign div_rem_add6_op1_0[REM_WIDTH-1:0]       = rem_sign 
+                                                 ? div_qt_r6_rem_add_op1_0[REM_WIDTH-1:0]
+                                                 : div_qt_6_rem_add_op1_0[REM_WIDTH-1:0];
+assign div_rem_add6_op1_1[REM_WIDTH-1:0]       = rem_sign 
+                                                 ? div_qt_r6_rem_add_op1_1[REM_WIDTH-1:0]
+                                                 : div_qt_6_rem_add_op1_1[REM_WIDTH-1:0];
+assign div_rem_add7_op1_0[REM_WIDTH-1:0]       = rem_sign 
+                                                 ? div_qt_r7_rem_add_op1_0[REM_WIDTH-1:0]
+                                                 : div_qt_7_rem_add_op1_0[REM_WIDTH-1:0];
+assign div_rem_add7_op1_1[REM_WIDTH-1:0]       = rem_sign 
+                                                 ? div_qt_r7_rem_add_op1_1[REM_WIDTH-1:0]
+                                                 : div_qt_7_rem_add_op1_1[REM_WIDTH-1:0];
+assign div_rem_add9_op1_0[REM_WIDTH-1:0]       = rem_sign 
+                                                 ? div_qt_r9_rem_add_op1_0[REM_WIDTH-1:0]
+                                                 : div_qt_9_rem_add_op1_0[REM_WIDTH-1:0];
+assign div_rem_add9_op1_1[REM_WIDTH-1:0]       = rem_sign 
+                                                 ? div_qt_r9_rem_add_op1_1[REM_WIDTH-1:0]
+                                                 : div_qt_9_rem_add_op1_1[REM_WIDTH-1:0];
+//=====================================
+// the remainder calculation: AND-OR select of the division or square-root addend for every digit
+//=====================================
+assign rem_add1_op1[REM_WIDTH-1:0]     = ({REM_WIDTH{srt_sel_div}}  &  div_rem_add1_op1[REM_WIDTH-1:0]) // all-zero when neither select is set;
+                                        |({REM_WIDTH{srt_sel_sqrt}} & sqrt_rem_add1_op1[REM_WIDTH-1:0]); // NOTE(review): presumably the selects are mutually exclusive - confirm
+assign rem_add2_op1[REM_WIDTH-1:0]     = ({REM_WIDTH{srt_sel_div}}  &  div_rem_add2_op1[REM_WIDTH-1:0])
+                                        |({REM_WIDTH{srt_sel_sqrt}} & sqrt_rem_add2_op1[REM_WIDTH-1:0]);
+assign rem_add4_op1[REM_WIDTH-1:0]     = ({REM_WIDTH{srt_sel_div}}  &  div_rem_add4_op1[REM_WIDTH-1:0])
+                                        |({REM_WIDTH{srt_sel_sqrt}} & sqrt_rem_add4_op1[REM_WIDTH-1:0]);
+assign rem_add8_op1[REM_WIDTH-1:0]     = ({REM_WIDTH{srt_sel_div}}  &  div_rem_add8_op1[REM_WIDTH-1:0])
+                                        |({REM_WIDTH{srt_sel_sqrt}} & sqrt_rem_add8_op1[REM_WIDTH-1:0]);
+assign rem_add3_op1_0[REM_WIDTH-1:0]   = ({REM_WIDTH{srt_sel_div}}  &  div_rem_add3_op1_0[REM_WIDTH-1:0])
+                                        |({REM_WIDTH{srt_sel_sqrt}} & sqrt_rem_add3_op1_0[REM_WIDTH-1:0]);
+assign rem_add3_op1_1[REM_WIDTH-1:0]   = ({REM_WIDTH{srt_sel_div}}  &  div_rem_add3_op1_1[REM_WIDTH-1:0])
+                                        |({REM_WIDTH{srt_sel_sqrt}} & sqrt_rem_add3_op1_1[REM_WIDTH-1:0]);
+assign rem_add5_op1_0[REM_WIDTH-1:0]   = ({REM_WIDTH{srt_sel_div}}  &  div_rem_add5_op1_0[REM_WIDTH-1:0])
+                                        |({REM_WIDTH{srt_sel_sqrt}} & sqrt_rem_add5_op1_0[REM_WIDTH-1:0]);
+assign rem_add5_op1_1[REM_WIDTH-1:0]   = ({REM_WIDTH{srt_sel_div}}  &  div_rem_add5_op1_1[REM_WIDTH-1:0])
+                                        |({REM_WIDTH{srt_sel_sqrt}} & sqrt_rem_add5_op1_1[REM_WIDTH-1:0]);
+assign rem_add6_op1_0[REM_WIDTH-1:0]   = ({REM_WIDTH{srt_sel_div}}  &  div_rem_add6_op1_0[REM_WIDTH-1:0])
+                                        |({REM_WIDTH{srt_sel_sqrt}} & sqrt_rem_add6_op1_0[REM_WIDTH-1:0]);
+assign rem_add6_op1_1[REM_WIDTH-1:0]   = ({REM_WIDTH{srt_sel_div}}  &  div_rem_add6_op1_1[REM_WIDTH-1:0])
+                                        |({REM_WIDTH{srt_sel_sqrt}} & sqrt_rem_add6_op1_1[REM_WIDTH-1:0]);
+assign rem_add7_op1_0[REM_WIDTH-1:0]   = ({REM_WIDTH{srt_sel_div}}  &  div_rem_add7_op1_0[REM_WIDTH-1:0])
+                                        |({REM_WIDTH{srt_sel_sqrt}} & sqrt_rem_add7_op1_0[REM_WIDTH-1:0]);
+assign rem_add7_op1_1[REM_WIDTH-1:0]   = ({REM_WIDTH{srt_sel_div}}  &  div_rem_add7_op1_1[REM_WIDTH-1:0])
+                                        |({REM_WIDTH{srt_sel_sqrt}} & sqrt_rem_add7_op1_1[REM_WIDTH-1:0]);
+assign rem_add9_op1_0[REM_WIDTH-1:0]   = ({REM_WIDTH{srt_sel_div}}  &  div_rem_add9_op1_0[REM_WIDTH-1:0])
+                                        |({REM_WIDTH{srt_sel_sqrt}} & sqrt_rem_add9_op1_0[REM_WIDTH-1:0]);
+assign rem_add9_op1_1[REM_WIDTH-1:0]   = ({REM_WIDTH{srt_sel_div}}  &  div_rem_add9_op1_1[REM_WIDTH-1:0])
+                                        |({REM_WIDTH{srt_sel_sqrt}} & sqrt_rem_add9_op1_1[REM_WIDTH-1:0]);
+// remainder calculation for all of the remainders: common first operand is srt_remainder shifted left by 4 bits (top 4 bits dropped)
+assign remainder_shift[REM_WIDTH-1:0]  = {srt_remainder[REM_WIDTH-5:0],4'b0};
+// &Force("nonport","cur_rem_1"); @518
+// &Force("nonport","cur_rem_2"); @519
+// &Force("nonport","cur_rem_3"); @520
+// &Force("nonport","cur_rem_4"); @521
+// &Force("nonport","cur_rem_5"); @522
+// &Force("nonport","cur_rem_6"); @523
+// &Force("nonport","cur_rem_7"); @524
+// &Force("nonport","cur_rem_8"); @525
+// &Force("nonport","cur_rem_9"); @526
+// &Force("nonport","remainder_shift"); @527
+// &Force("nonport","rem_add1_op1"); @528
+// &Force("nonport","rem_add2_op1"); @529
+// &Force("nonport","rem_add3_op1_0"); @530
+// &Force("nonport","rem_add3_op1_1"); @531
+// &Force("nonport","rem_add4_op1"); @532
+// &Force("nonport","rem_add5_op1_0"); @533
+// &Force("nonport","rem_add5_op1_1"); @534
+// &Force("nonport","rem_add6_op1_0"); @535
+// &Force("nonport","rem_add6_op1_1"); @536
+// &Force("nonport","rem_add7_op1_0"); @537
+// &Force("nonport","rem_add7_op1_1"); @538
+// &Force("nonport","rem_add8_op1"); @539
+// &Force("nonport","rem_add9_op1_0"); @540
+// &Force("nonport","rem_add9_op1_1"); @541
+//csky vperl_off
+assign cur_rem_1[REM_WIDTH-1:0]        = $unsigned($signed(remainder_shift[REM_WIDTH-1:0]) // candidate next remainder for digit magnitude 1:
+                                         + $signed(rem_add1_op1[REM_WIDTH-1:0])            // shifted remainder + selected addend
+                                         + $signed({{REM_WIDTH-1{1'b0}},~rem_sign}));   // +1 when rem_sign=0, where the addend is the inverted form
+assign cur_rem_2[REM_WIDTH-1:0]        = $unsigned($signed(remainder_shift[REM_WIDTH-1:0])
+                                         + $signed(rem_add2_op1[REM_WIDTH-1:0])
+                                         + $signed({{REM_WIDTH-1{1'b0}},~rem_sign}));  
+assign cur_rem_4[REM_WIDTH-1:0]        = $unsigned($signed(remainder_shift[REM_WIDTH-1:0])
+                                         + $signed(rem_add4_op1[REM_WIDTH-1:0])
+                                         + $signed({{REM_WIDTH-1{1'b0}},~rem_sign}));   
+assign cur_rem_8[REM_WIDTH-1:0]        = $unsigned($signed(remainder_shift[REM_WIDTH-1:0])
+                                         + $signed(rem_add8_op1[REM_WIDTH-1:0])
+                                         + $signed({{REM_WIDTH-1{1'b0}},~rem_sign}));  
+assign cur_rem_3[REM_WIDTH-1:0]        = $unsigned($signed(remainder_shift[REM_WIDTH-1:0]) // two-addend digits (3,5,6,9):
+                                         + $signed(rem_add3_op1_0[REM_WIDTH-1:0])
+                                         + $signed(rem_add3_op1_1[REM_WIDTH-1:0])
+                                         + $signed({{REM_WIDTH-2{1'b0}},~rem_sign,1'b0})); // +2 when rem_sign=0 (one per addend)
+assign cur_rem_5[REM_WIDTH-1:0]        = $unsigned($signed(remainder_shift[REM_WIDTH-1:0])
+                                         + $signed(rem_add5_op1_0[REM_WIDTH-1:0])
+                                         + $signed(rem_add5_op1_1[REM_WIDTH-1:0])
+                                         + $signed({{REM_WIDTH-2{1'b0}},~rem_sign,1'b0}));
+assign cur_rem_6[REM_WIDTH-1:0]        = $unsigned($signed(remainder_shift[REM_WIDTH-1:0])
+                                         + $signed(rem_add6_op1_0[REM_WIDTH-1:0])
+                                         + $signed(rem_add6_op1_1[REM_WIDTH-1:0])
+                                         + $signed({{REM_WIDTH-2{1'b0}},~rem_sign,1'b0}));
+assign cur_rem_7[REM_WIDTH-1:0]        = $unsigned($signed(remainder_shift[REM_WIDTH-1:0]) // digit 7 always adds +1 regardless of rem_sign:
+                                         + $signed(rem_add7_op1_0[REM_WIDTH-1:0])          // in the div and positive-sqrt digit-7 addends defined above,
+                                         + $signed(rem_add7_op1_1[REM_WIDTH-1:0])          // exactly one of the two addends is an inverted value
+                                         + $signed({{REM_WIDTH-1{1'b0}},1'b1}));
+assign cur_rem_9[REM_WIDTH-1:0]        = $unsigned($signed(remainder_shift[REM_WIDTH-1:0])
+                                         + $signed(rem_add9_op1_0[REM_WIDTH-1:0])
+                                         + $signed(rem_add9_op1_1[REM_WIDTH-1:0])
+                                         + $signed({{REM_WIDTH-2{1'b0}},~rem_sign,1'b0}));
+//csky vperl_on
+//====================================================
+//  quotient selection: pack the nine bound-compare sign bits into one vector, bound1 in bit 8 down to bound9 in bit 0
+//==================================================== 
+assign bound_cmp_sign[8:0] =  {bound1_cmp_sign,bound2_cmp_sign,bound3_cmp_sign,bound4_cmp_sign,
+             bound5_cmp_sign,bound6_cmp_sign,bound7_cmp_sign,bound8_cmp_sign,bound9_cmp_sign};
+
+// &CombBeg; @582
+always @( rem_sign // combinational update of the accumulated result (total_qt_rt) and its companion (total_qt_rt_minus)
+       or bound_cmp_sign[8:0]
+       or qt_rt_const_q10[57:0]
+       or qt_rt_const_q6[57:0]
+       or qt_rt_const_q12[57:0]
+       or qt_rt_const_q13[57:0]
+       or qt_rt_const_q5[57:0]
+       or qt_rt_const_q9[57:0]
+       or qt_rt_const_q8[57:0]
+       or total_qt_rt_minus[57:0]
+       or qt_rt_const_q4[57:0]
+       or qt_rt_const_q2[57:0]
+       or qt_rt_const_q15[57:0]
+       or qt_rt_const_q1[57:0]
+       or qt_rt_const_q7[57:0]
+       or qt_rt_const_q14[57:0]
+       or qt_rt_const_q3[57:0]
+       or qt_rt_const_q11[57:0]
+       or total_qt_rt[57:0]) // NOTE(review): list hard-codes [57:0] while the body indexes QT_WIDTH-1:0 - they agree only if QT_WIDTH is 58; confirm
+begin
+case({rem_sign,bound_cmp_sign[8:0]}) // {sign, 9-bit code}; the number of leading zeros in bound_cmp_sign is the digit magnitude
+  10'b0111111111: //0 with rem_sign=0: total_qt_rt is kept, the minus copy becomes total_qt_rt_minus | q15
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q15[QT_WIDTH-1:0];
+  end
+  10'b0011111111: //1: for digits 1..9 with rem_sign=0, next = total_qt_rt | q<d> and minus_next = total_qt_rt | q<d-1>
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q1[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt[QT_WIDTH-1:0];
+  end
+  10'b0001111111://2
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q2[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q1[QT_WIDTH-1:0];
+  end
+  10'b0000111111://3
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q3[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q2[QT_WIDTH-1:0];
+  end
+  10'b0000011111://4
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q4[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q3[QT_WIDTH-1:0];
+  end
+  10'b0000001111://5
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q5[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q4[QT_WIDTH-1:0];
+  end
+  10'b0000000111://6
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q6[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q5[QT_WIDTH-1:0];
+  end
+  10'b0000000011://7
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q7[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q6[QT_WIDTH-1:0];
+  end
+  10'b0000000001://8
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q8[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q7[QT_WIDTH-1:0];
+  end
+  10'b0000000000://9
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q9[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt[QT_WIDTH-1:0]
+                                            |qt_rt_const_q8[QT_WIDTH-1:0];
+  end
+  10'b1111111111: //0 with rem_sign=1: same assignments as the rem_sign=0 zero case
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q15[QT_WIDTH-1:0];
+  end
+  10'b1011111111: //-1: for digits -1..-9, next = total_qt_rt_minus | q<16-d> and minus_next = total_qt_rt_minus | q<15-d>
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q15[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q14[QT_WIDTH-1:0];
+  end
+  10'b1001111111://-2
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q14[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q13[QT_WIDTH-1:0];
+  end
+  10'b1000111111://-3
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q13[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q12[QT_WIDTH-1:0];
+  end
+  10'b1000011111://-4
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q12[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q11[QT_WIDTH-1:0];
+  end
+  10'b1000001111://-5
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q11[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q10[QT_WIDTH-1:0];
+  end
+  10'b1000000111://-6
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q10[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q9[QT_WIDTH-1:0];
+  end
+  10'b1000000011://-7
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q9[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q8[QT_WIDTH-1:0];
+  end
+  10'b1000000001://-8
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q8[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q7[QT_WIDTH-1:0];
+  end
+  10'b1000000000://-9
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q7[QT_WIDTH-1:0];
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = total_qt_rt_minus[QT_WIDTH-1:0]
+                                            |qt_rt_const_q6[QT_WIDTH-1:0];
+  end
+  default : // any other compare pattern: both outputs are driven to X
+  begin
+    total_qt_rt_next[QT_WIDTH-1:0]       = {QT_WIDTH{1'bx}};
+    total_qt_rt_minus_next[QT_WIDTH-1:0] = {QT_WIDTH{1'bx}};
+  end
+endcase
+// &CombEnd; @727
+end
+//====================================================
+//  remainder selection: pick the candidate remainder that matches the digit magnitude encoded in bound_cmp_sign
+//====================================================
+// &CombBeg; @731
+always @( cur_rem_2[60:0] // NOTE(review): list hard-codes [60:0] while the body indexes REM_WIDTH-1:0 - they agree only if REM_WIDTH is 61; confirm
+       or remainder_shift[60:0]
+       or bound_cmp_sign[8:0]
+       or cur_rem_6[60:0]
+       or cur_rem_8[60:0]
+       or cur_rem_3[60:0]
+       or cur_rem_7[60:0]
+       or cur_rem_4[60:0]
+       or cur_rem_5[60:0]
+       or cur_rem_9[60:0]
+       or cur_rem_1[60:0])
+begin
+case(bound_cmp_sign[8:0])
+  9'b111111111: cur_rem[REM_WIDTH-1:0]   = remainder_shift[REM_WIDTH-1:0]; //0: shifted remainder passes through unchanged
+  9'b011111111: cur_rem[REM_WIDTH-1:0]   = cur_rem_1[REM_WIDTH-1:0];       //+-1          
+  9'b001111111: cur_rem[REM_WIDTH-1:0]   = cur_rem_2[REM_WIDTH-1:0];       //+-2          
+  9'b000111111: cur_rem[REM_WIDTH-1:0]   = cur_rem_3[REM_WIDTH-1:0];       //+-3          
+  9'b000011111: cur_rem[REM_WIDTH-1:0]   = cur_rem_4[REM_WIDTH-1:0];       //+-4          
+  9'b000001111: cur_rem[REM_WIDTH-1:0]   = cur_rem_5[REM_WIDTH-1:0];       //+-5          
+  9'b000000111: cur_rem[REM_WIDTH-1:0]   = cur_rem_6[REM_WIDTH-1:0];       //+-6          
+  9'b000000011: cur_rem[REM_WIDTH-1:0]   = cur_rem_7[REM_WIDTH-1:0];       //+-7          
+  9'b000000001: cur_rem[REM_WIDTH-1:0]   = cur_rem_8[REM_WIDTH-1:0];       //+-8          
+  9'b000000000: cur_rem[REM_WIDTH-1:0]   = cur_rem_9[REM_WIDTH-1:0];       //+-9          
+  default :     cur_rem[REM_WIDTH-1:0]   = {REM_WIDTH{1'bx}}; // any other compare pattern drives X
+endcase
+// &CombEnd; @745
+end
+assign srt_remainder_nxt[REM_WIDTH-1:0]   = cur_rem[REM_WIDTH-1:0];
+//assign srt_remainder_zero                 = ~|srt_remainder_nxt[REM_WIDTH-1:0];
+assign srt_remainder_sign                 = srt_remainder_nxt[REM_WIDTH-1]; // top bit of the selected next remainder
+
+//====================================================
+//  remainder logic for integer VREM/VREMU inst: register holding srt_remainder_minus
+//====================================================
+always @(posedge srt_qt_rem_clk or negedge cpurst_b)
+begin
+  if(!cpurst_b)begin // asynchronous active-low reset clears the register
+    srt_remainder_minus[REM_WIDTH-1:0]  <= {REM_WIDTH{1'b0}};
+  end
+  else if(srt_sm_on)begin // load the selected next value while srt_sm_on is asserted
+    srt_remainder_minus[REM_WIDTH-1:0]  <= srt_remainder_minus_nxt[REM_WIDTH-1:0];
+  end
+  else begin // otherwise hold the current value
+    srt_remainder_minus[REM_WIDTH-1:0]  <= srt_remainder_minus[REM_WIDTH-1:0];
+  end
+end
+
+assign srt_remainder_out[REM_WIDTH-2:0] = srt_remainder[REM_WIDTH-1] ? srt_remainder_minus[REM_WIDTH-2:0] // top bit of srt_remainder set: output the minus copy,
+                                                                     : srt_remainder[REM_WIDTH-2:0];      // otherwise srt_remainder itself (top bit dropped in both cases)
+
+assign remainder_minus_shift[REM_WIDTH-1:0] = {srt_remainder_minus[REM_WIDTH-5:0],4'b0}; // srt_remainder_minus shifted left by 4 bits
+//csky vperl_off
+assign rem_minus_minus_6[REM_WIDTH-1:0]      = $unsigned($signed(remainder_minus_shift[REM_WIDTH-1:0]) // shifted minus remainder
+                                             - $signed({div_qt_r4_rem_add_op1[REM_WIDTH-1:0]})         // minus 4x divisor
+                                             - $signed({div_qt_r2_rem_add_op1[REM_WIDTH-1:0]}));       // minus 2x divisor
+//assign rem_minus_minus_4[REM_WIDTH-1:0]      = $unsigned($signed(remainder_minus_shift[REM_WIDTH-1:0])
+//                                             - $signed({div_qt_r4_rem_add_op1[REM_WIDTH-1:0]}));
+//csky vperl_on
+
+// &Force("nonport","rem_minus_minus_6"); @778
+// //&Force("nonport","rem_minus_minus_4"); @779
+
+// here add for positive remainder calculation: candidate k is cur_rem_<k+1> when rem_sign=1, else cur_rem_<k-1> (exceptions noted). NOTE(review): presumably the remainder paired with total_qt_rt_minus - confirm
+assign remainder_minus_nor_nxt_0[REM_WIDTH-1:0]  = rem_sign ? cur_rem_1[REM_WIDTH-1:0]
+						            : remainder_minus_shift[REM_WIDTH-1:0]; // k=0, rem_sign=0: shifted minus remainder
+assign remainder_minus_nor_nxt_1[REM_WIDTH-1:0]  = rem_sign ? cur_rem_2[REM_WIDTH-1:0]
+                                                            : remainder_shift[REM_WIDTH-1:0]; // k=1, rem_sign=0: plain shifted remainder
+assign remainder_minus_nor_nxt_2[REM_WIDTH-1:0]  = rem_sign ? cur_rem_3[REM_WIDTH-1:0]
+                                                            : cur_rem_1[REM_WIDTH-1:0];
+assign remainder_minus_nor_nxt_3[REM_WIDTH-1:0]  = rem_sign ? cur_rem_4[REM_WIDTH-1:0]
+                                                            : cur_rem_2[REM_WIDTH-1:0];
+assign remainder_minus_nor_nxt_4[REM_WIDTH-1:0]  = rem_sign ? cur_rem_5[REM_WIDTH-1:0]
+                                                            : cur_rem_3[REM_WIDTH-1:0];
+assign remainder_minus_nor_nxt_5[REM_WIDTH-1:0]  = rem_sign ? cur_rem_6[REM_WIDTH-1:0]
+                                                            : cur_rem_4[REM_WIDTH-1:0];
+assign remainder_minus_nor_nxt_6[REM_WIDTH-1:0]  = rem_sign ? cur_rem_7[REM_WIDTH-1:0]
+                                                            : cur_rem_5[REM_WIDTH-1:0];
+assign remainder_minus_nor_nxt_7[REM_WIDTH-1:0]  = rem_sign ? cur_rem_8[REM_WIDTH-1:0]
+                                                            : cur_rem_6[REM_WIDTH-1:0];
+assign remainder_minus_nor_nxt_8[REM_WIDTH-1:0]  = rem_sign ? cur_rem_9[REM_WIDTH-1:0]
+                                                            : cur_rem_7[REM_WIDTH-1:0];
+assign remainder_minus_nor_nxt_9[REM_WIDTH-1:0]  = rem_sign ? rem_minus_minus_6[REM_WIDTH-1:0] // k=9, rem_sign=1: no cur_rem_10 exists, rem_minus_minus_6 is used
+                                                            : cur_rem_8[REM_WIDTH-1:0];
+// &CombBeg;                       @802
+always @( bound_cmp_sign[8:0] // combinational pick of the next srt_remainder_minus value from the ten candidates
+       or remainder_minus_nor_nxt_7[60:0] // NOTE(review): [60:0] is hard-coded while the body indexes REM_WIDTH-1:0 - confirm REM_WIDTH is 61
+       or remainder_minus_nor_nxt_1[60:0]
+       or remainder_minus_nor_nxt_4[60:0]
+       or remainder_minus_nor_nxt_5[60:0]
+       or remainder_minus_nor_nxt_2[60:0]
+       or remainder_minus_nor_nxt_9[60:0]
+       or remainder_minus_nor_nxt_8[60:0]
+       or remainder_minus_nor_nxt_0[60:0]
+       or remainder_minus_nor_nxt_3[60:0]
+       or remainder_minus_nor_nxt_6[60:0])
+begin
+case({bound_cmp_sign[8:0]}) // same compare code as the cur_rem selection: number of leading zeros = digit magnitude
+  9'b111111111: srt_remainder_minus_nxt[REM_WIDTH-1:0] = remainder_minus_nor_nxt_0[REM_WIDTH-1:0];//0
+  9'b011111111: srt_remainder_minus_nxt[REM_WIDTH-1:0] = remainder_minus_nor_nxt_1[REM_WIDTH-1:0];//+-1
+  9'b001111111: srt_remainder_minus_nxt[REM_WIDTH-1:0] = remainder_minus_nor_nxt_2[REM_WIDTH-1:0];//+-2
+  9'b000111111: srt_remainder_minus_nxt[REM_WIDTH-1:0] = remainder_minus_nor_nxt_3[REM_WIDTH-1:0];//+-3
+  9'b000011111: srt_remainder_minus_nxt[REM_WIDTH-1:0] = remainder_minus_nor_nxt_4[REM_WIDTH-1:0];//+-4
+  9'b000001111: srt_remainder_minus_nxt[REM_WIDTH-1:0] = remainder_minus_nor_nxt_5[REM_WIDTH-1:0];//+-5
+  9'b000000111: srt_remainder_minus_nxt[REM_WIDTH-1:0] = remainder_minus_nor_nxt_6[REM_WIDTH-1:0];//+-6
+  9'b000000011: srt_remainder_minus_nxt[REM_WIDTH-1:0] = remainder_minus_nor_nxt_7[REM_WIDTH-1:0];//+-7
+  9'b000000001: srt_remainder_minus_nxt[REM_WIDTH-1:0] = remainder_minus_nor_nxt_8[REM_WIDTH-1:0];//+-8
+  9'b000000000: srt_remainder_minus_nxt[REM_WIDTH-1:0] = remainder_minus_nor_nxt_9[REM_WIDTH-1:0];//+-9
+  default :     srt_remainder_minus_nxt[REM_WIDTH-1:0] = {REM_WIDTH{1'bx}}; // any other compare pattern drives X
+endcase
+// &CombEnd; @816
+end
+
+// &ModuleEnd; @818
+endmodule
+
+
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v
new file mode 100644
index 00000000..f8846255
--- /dev/null
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v
@@ -0,0 +1,331 @@
+/*Copyright 2019-2021 T-Head Semiconductor Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// &Depend("cpu_cfig.h"); @22
+// &ModuleBeg; @23
+module ct_vfdsu_top(  // Structural wrapper: instantiates ct_vfdsu_ctrl, ct_vfdsu_double and ct_vfdsu_scalar_dp and connects them; contains no logic of its own
+  cp0_vfpu_icg_en,
+  cp0_yy_clk_en,
+  cpurst_b,
+  dp_vfdsu_ex1_pipex_dst_ereg,
+  dp_vfdsu_ex1_pipex_dst_vreg,
+  dp_vfdsu_ex1_pipex_iid,
+  dp_vfdsu_ex1_pipex_imm0,
+  dp_vfdsu_ex1_pipex_sel,
+  dp_vfdsu_ex1_pipex_srcf0,
+  dp_vfdsu_ex1_pipex_srcf1,
+  dp_vfdsu_fdiv_gateclk_issue,
+  dp_vfdsu_idu_fdiv_issue,
+  forever_cpuclk,
+  idu_vfpu_rf_pipex_func,
+  idu_vfpu_rf_pipex_gateclk_sel,
+  pad_yy_icg_scan_en,
+  pipex_dp_vfdsu_ereg,
+  pipex_dp_vfdsu_ereg_data,
+  pipex_dp_vfdsu_freg_data,
+  pipex_dp_vfdsu_inst_vld,
+  pipex_dp_vfdsu_vreg,
+  rtu_yy_xx_flush,
+  vfdsu_dp_fdiv_busy,
+  vfdsu_dp_inst_wb_req,
+  vfdsu_ifu_debug_ex2_wait,
+  vfdsu_ifu_debug_idle,
+  vfdsu_ifu_debug_pipe_busy,
+  vfpu_yy_xx_dqnan,
+  vfpu_yy_xx_rm
+);
+
+// &Ports; @24
+input           cp0_vfpu_icg_en;              
+input           cp0_yy_clk_en;                
+input           cpurst_b;                     
+input   [4 :0]  dp_vfdsu_ex1_pipex_dst_ereg;  
+input   [6 :0]  dp_vfdsu_ex1_pipex_dst_vreg;  
+input   [6 :0]  dp_vfdsu_ex1_pipex_iid;       
+input   [2 :0]  dp_vfdsu_ex1_pipex_imm0;      
+input           dp_vfdsu_ex1_pipex_sel;       
+input   [63:0]  dp_vfdsu_ex1_pipex_srcf0;     
+input   [63:0]  dp_vfdsu_ex1_pipex_srcf1;     
+input           dp_vfdsu_fdiv_gateclk_issue;  
+input           dp_vfdsu_idu_fdiv_issue;      
+input           forever_cpuclk;               
+input   [19:0]  idu_vfpu_rf_pipex_func;       
+input           idu_vfpu_rf_pipex_gateclk_sel; 
+input           pad_yy_icg_scan_en;           
+input           rtu_yy_xx_flush;              
+input           vfpu_yy_xx_dqnan;             
+input   [2 :0]  vfpu_yy_xx_rm;                
+output  [4 :0]  pipex_dp_vfdsu_ereg;          
+output  [4 :0]  pipex_dp_vfdsu_ereg_data;     
+output  [63:0]  pipex_dp_vfdsu_freg_data;     
+output          pipex_dp_vfdsu_inst_vld;      
+output  [6 :0]  pipex_dp_vfdsu_vreg;          
+output          vfdsu_dp_fdiv_busy;           
+output          vfdsu_dp_inst_wb_req;         
+output          vfdsu_ifu_debug_ex2_wait;     
+output          vfdsu_ifu_debug_idle;         
+output          vfdsu_ifu_debug_pipe_busy;    
+
+// &Regs; @25
+// (no regs are declared: every signal in this module is a wire)
+// &Wires; @26
+wire            cp0_vfpu_icg_en;              
+wire            cp0_yy_clk_en;                
+wire            cpurst_b;                     
+wire    [4 :0]  dp_vfdsu_ex1_pipex_dst_ereg;  
+wire    [6 :0]  dp_vfdsu_ex1_pipex_dst_vreg;  
+wire    [6 :0]  dp_vfdsu_ex1_pipex_iid;       
+wire    [2 :0]  dp_vfdsu_ex1_pipex_imm0;      
+wire            dp_vfdsu_ex1_pipex_sel;       
+wire    [63:0]  dp_vfdsu_ex1_pipex_srcf0;     
+wire    [63:0]  dp_vfdsu_ex1_pipex_srcf1;     
+wire            dp_vfdsu_fdiv_gateclk_issue;  
+wire            dp_vfdsu_idu_fdiv_issue;      
+wire            ex1_data_clk;                 
+wire            ex1_div;                      
+wire            ex1_double;                   
+wire            ex1_pipedown;                 
+wire            ex1_scalar;                   
+wire            ex1_single;                   
+wire            ex1_sqrt;                     
+wire    [63:0]  ex1_src0;                     
+wire    [63:0]  ex1_src1;                     
+wire    [2 :0]  ex1_static_rm;                
+wire            ex2_data_clk;                 
+wire            ex2_pipedown;                 
+wire            ex2_srt_first_round;          
+wire            ex3_data_clk;                 
+wire            ex3_pipedown;                 
+wire    [4 :0]  ex4_out_expt;                 
+wire    [63:0]  ex4_out_result;               
+wire            forever_cpuclk;               
+wire    [19:0]  idu_vfpu_rf_pipex_func;       
+wire            idu_vfpu_rf_pipex_gateclk_sel; 
+wire            pad_yy_icg_scan_en;           
+wire    [4 :0]  pipex_dp_vfdsu_ereg;          
+wire    [4 :0]  pipex_dp_vfdsu_ereg_data;     
+wire    [63:0]  pipex_dp_vfdsu_freg_data;     
+wire            pipex_dp_vfdsu_inst_vld;      
+wire    [6 :0]  pipex_dp_vfdsu_vreg;          
+wire            rtu_yy_xx_flush;              
+wire            srt_ctrl_rem_zero;            
+wire            srt_ctrl_skip_srt;            
+wire            srt_secd_round;               
+wire            srt_sm_on;                    
+wire            vfdsu_dp_fdiv_busy;           
+wire            vfdsu_dp_inst_wb_req;         
+wire            vfdsu_ex2_double;             
+wire            vfdsu_ex2_single;             
+wire            vfdsu_ifu_debug_ex2_wait;     
+wire            vfdsu_ifu_debug_idle;         
+wire            vfdsu_ifu_debug_pipe_busy;    
+wire            vfpu_yy_xx_dqnan;             
+wire    [2 :0]  vfpu_yy_xx_rm;                
+// NOTE(review): directives @28-@129 below are comments only; they name half/single/set1 instances and selects (ex1_src0[127:64], srt_secd_round[3]) that do not exist here (ex1_src0 is 64 bits, srt_secd_round is 1 bit). Only the three instances after them are real.
+// &Instance("ct_vfdsu_ctrl"); @28
+// &Instance("ct_vfdsu_dp"); @29
+// &ConnRule(s/ex4_out/set0_doub_ex4/); @30
+// &ConnRule(s/srt_ctrl/set0_doub_srt_ctrl/); @31
+// &ConnRule(s/vfdsu_ex2_/dp_set0_double_ex2_/); @32
+// &ConnRule(s/slice_x/slice_0/); @33
+// &ConnRule(s/vfdsu_ex3_/dp_set0_double_ex3_/); @34
+// &ConnRule(s/vfdsu_ex4_/dp_set0_double_ex4_/); @35
+// &Instance("ct_vfdsu_double","x_ct_vfdsu_double_set0"); @36
+// &Connect(.ex1_src0(ex1_src0[63:0])); @37
+// &Connect(.ex1_src1(ex1_src1[63:0])); @38
+// &Connect(.ex1_double(set0_ex1_double)); @39
+// &Connect(.srt_secd_round(srt_secd_round[0])); @40
+// &Connect(.ex2_srt_first_round(ex2_srt_first_round[0])); @41
+// &ConnRule(s/ex4_out/set0_half0_ex4/); @43
+// &ConnRule(s/_pipedown/_half_pipedown/); @44
+// &ConnRule(s/srt_ctrl/set0_half0_srt_ctrl/); @45
+// &ConnRule(s/vfdsu_ex2_/dp_set0_half0_ex2_/); @46
+// &ConnRule(s/vfdsu_ex3_/dp_set0_half0_ex3_/); @47
+// &ConnRule(s/vfdsu_ex4_/dp_set0_half0_ex4_/); @48
+// &Instance("ct_vfdsu_half","x_ct_vfdsu_half0_set0"); @49
+// &Connect(.ex1_src0(ex1_src0[31:16])); @50
+// &Connect(.ex1_src1(ex1_src1[31:16])); @51
+// &Connect(.srt_secd_round(srt_secd_round[1])); @52
+// &Connect(.ex2_srt_first_round(ex2_srt_first_round[1])); @53
+// &ConnRule(s/ex4_out/set0_sing_ex4/); @56
+// &ConnRule(s/_pipedown/_sing_pipedown/); @57
+// &ConnRule(s/srt_ctrl/set0_sing_srt_ctrl/); @58
+// &ConnRule(s/slice_x/slice_0/); @59
+// &Instance("ct_vfdsu_single","x_ct_vfdsu_single_set0"); @60
+// &Connect(.ex1_src0(ex1_src0[63:32])); @61
+// &Connect(.ex1_src1(ex1_src1[63:32])); @62
+// &Connect(.srt_secd_round(srt_secd_round[1])); @63
+// &Connect(.ex2_srt_first_round(ex2_srt_first_round[1])); @64
+// &ConnRule(s/ex4_out/set0_half1_ex4/); @68
+// &ConnRule(s/_pipedown/_half_pipedown/); @69
+// &ConnRule(s/srt_ctrl/set0_half1_srt_ctrl/); @70
+// &ConnRule(s/vfdsu_ex2_/dp_set0_half1_ex2_/); @71
+// &ConnRule(s/vfdsu_ex3_/dp_set0_half1_ex3_/); @72
+// &ConnRule(s/vfdsu_ex4_/dp_set0_half1_ex4_/); @73
+// &Instance("ct_vfdsu_half","x_ct_vfdsu_half1_set0"); @74
+// &Connect(.ex1_src0(ex1_src0[63:48])); @75
+// &Connect(.ex1_src1(ex1_src1[63:48])); @76
+// &Connect(.srt_secd_round(srt_secd_round[1])); @77
+// &Connect(.ex2_srt_first_round(ex2_srt_first_round[1])); @78
+// &ConnRule(s/ex4_out/set1_doub_ex4/); @81
+// &ConnRule(s/srt_ctrl/set1_doub_srt_ctrl/); @82
+// &ConnRule(s/vfdsu_ex2_/dp_set1_double_ex2_/); @83
+// &ConnRule(s/slice_x/slice_1/); @84
+// &ConnRule(s/vfdsu_ex3_/dp_set1_double_ex3_/); @85
+// &ConnRule(s/vfdsu_ex4_/dp_set1_double_ex4_/); @86
+// &Instance("ct_vfdsu_double","x_ct_vfdsu_double_set1"); @87
+// &Connect(.ex1_src0(ex1_src0[127:64])); @88
+// &Connect(.ex1_src1(ex1_src1[127:64])); @89
+// &Connect(.ex1_double(set1_ex1_double)); @90
+// &Connect(.srt_secd_round(srt_secd_round[2])); @91
+// &Connect(.ex2_srt_first_round(ex2_srt_first_round[2])); @92
+// &ConnRule(s/ex4_out/set1_half0_ex4/); @95
+// &ConnRule(s/_pipedown/_half_pipedown/); @96
+// &ConnRule(s/srt_ctrl/set1_half0_srt_ctrl/); @97
+// &ConnRule(s/vfdsu_ex2_/dp_set1_half0_ex2_/); @98
+// &ConnRule(s/vfdsu_ex3_/dp_set1_half0_ex3_/); @99
+// &ConnRule(s/vfdsu_ex4_/dp_set1_half0_ex4_/); @100
+// &Instance("ct_vfdsu_half","x_ct_vfdsu_half0_set1"); @101
+// &Connect(.ex1_src0(ex1_src0[95:80])); @102
+// &Connect(.ex1_src1(ex1_src1[95:80])); @103
+// &Connect(.srt_secd_round(srt_secd_round[3])); @104
+// &Connect(.ex2_srt_first_round(ex2_srt_first_round[3])); @105
+// &ConnRule(s/ex4_out/set1_sing_ex4/); @108
+// &ConnRule(s/_pipedown/_sing_pipedown/); @109
+// &ConnRule(s/srt_ctrl/set1_sing_srt_ctrl/); @110
+// &ConnRule(s/slice_x/slice_1/); @111
+// &Instance("ct_vfdsu_single","x_ct_vfdsu_single_set1"); @112
+// &Connect(.ex1_src0(ex1_src0[127:96])); @113
+// &Connect(.ex1_src1(ex1_src1[127:96])); @114
+// &Connect(.srt_secd_round(srt_secd_round[3])); @115
+// &Connect(.ex2_srt_first_round(ex2_srt_first_round[3])); @116
+// &ConnRule(s/ex4_out/set1_half1_ex4/); @119
+// &ConnRule(s/_pipedown/_half_pipedown/); @120
+// &ConnRule(s/srt_ctrl/set1_half1_srt_ctrl/); @121
+// &ConnRule(s/vfdsu_ex2_/dp_set1_half1_ex2_/); @122
+// &ConnRule(s/vfdsu_ex3_/dp_set1_half1_ex3_/); @123
+// &ConnRule(s/vfdsu_ex4_/dp_set1_half1_ex4_/); @124
+// &Instance("ct_vfdsu_half","x_ct_vfdsu_half1_set1"); @125
+// &Connect(.ex1_src0(ex1_src0[127:112])); @126
+// &Connect(.ex1_src1(ex1_src1[127:112])); @127
+// &Connect(.srt_secd_round(srt_secd_round[3])); @128
+// &Connect(.ex2_srt_first_round(ex2_srt_first_round[3])); @129
+// &Instance("ct_vfdsu_ctrl"); @132
+ct_vfdsu_ctrl  x_ct_vfdsu_ctrl (  // only instance tied to the pipex_sel/issue/flush inputs and to the inst_vld, busy, wb_req and debug outputs
+  .cp0_vfpu_icg_en             (cp0_vfpu_icg_en            ),
+  .cp0_yy_clk_en               (cp0_yy_clk_en              ),
+  .cpurst_b                    (cpurst_b                   ),
+  .dp_vfdsu_ex1_pipex_sel      (dp_vfdsu_ex1_pipex_sel     ),
+  .dp_vfdsu_fdiv_gateclk_issue (dp_vfdsu_fdiv_gateclk_issue),
+  .dp_vfdsu_idu_fdiv_issue     (dp_vfdsu_idu_fdiv_issue    ),
+  .ex1_data_clk                (ex1_data_clk               ),
+  .ex1_double                  (ex1_double                 ),
+  .ex1_pipedown                (ex1_pipedown               ),
+  .ex1_single                  (ex1_single                 ),
+  .ex2_data_clk                (ex2_data_clk               ),
+  .ex2_pipedown                (ex2_pipedown               ),
+  .ex2_srt_first_round         (ex2_srt_first_round        ),
+  .ex3_data_clk                (ex3_data_clk               ),
+  .ex3_pipedown                (ex3_pipedown               ),
+  .forever_cpuclk              (forever_cpuclk             ),
+  .pad_yy_icg_scan_en          (pad_yy_icg_scan_en         ),
+  .pipex_dp_vfdsu_inst_vld     (pipex_dp_vfdsu_inst_vld    ),
+  .rtu_yy_xx_flush             (rtu_yy_xx_flush            ),
+  .srt_ctrl_rem_zero           (srt_ctrl_rem_zero          ),
+  .srt_ctrl_skip_srt           (srt_ctrl_skip_srt          ),
+  .srt_secd_round              (srt_secd_round             ),
+  .srt_sm_on                   (srt_sm_on                  ),
+  .vfdsu_dp_fdiv_busy          (vfdsu_dp_fdiv_busy         ),
+  .vfdsu_dp_inst_wb_req        (vfdsu_dp_inst_wb_req       ),
+  .vfdsu_ex2_double            (vfdsu_ex2_double           ),
+  .vfdsu_ex2_single            (vfdsu_ex2_single           ),
+  .vfdsu_ifu_debug_ex2_wait    (vfdsu_ifu_debug_ex2_wait   ),
+  .vfdsu_ifu_debug_idle        (vfdsu_ifu_debug_idle       ),
+  .vfdsu_ifu_debug_pipe_busy   (vfdsu_ifu_debug_pipe_busy  )
+);
+
+// &Instance("ct_vfdsu_double"); @133
+ct_vfdsu_double  x_ct_vfdsu_double (  // only instance tied to vfpu_yy_xx_dqnan / vfpu_yy_xx_rm; shares ex1_*/ex4_out_* nets with scalar_dp and srt_* nets with ctrl
+  .cp0_vfpu_icg_en     (cp0_vfpu_icg_en    ),
+  .cp0_yy_clk_en       (cp0_yy_clk_en      ),
+  .cpurst_b            (cpurst_b           ),
+  .ex1_div             (ex1_div            ),
+  .ex1_double          (ex1_double         ),
+  .ex1_pipedown        (ex1_pipedown       ),
+  .ex1_scalar          (ex1_scalar         ),
+  .ex1_single          (ex1_single         ),
+  .ex1_sqrt            (ex1_sqrt           ),
+  .ex1_src0            (ex1_src0           ),
+  .ex1_src1            (ex1_src1           ),
+  .ex1_static_rm       (ex1_static_rm      ),
+  .ex2_pipedown        (ex2_pipedown       ),
+  .ex2_srt_first_round (ex2_srt_first_round),
+  .ex3_pipedown        (ex3_pipedown       ),
+  .ex4_out_expt        (ex4_out_expt       ),
+  .ex4_out_result      (ex4_out_result     ),
+  .forever_cpuclk      (forever_cpuclk     ),
+  .pad_yy_icg_scan_en  (pad_yy_icg_scan_en ),
+  .srt_ctrl_rem_zero   (srt_ctrl_rem_zero  ),
+  .srt_ctrl_skip_srt   (srt_ctrl_skip_srt  ),
+  .srt_secd_round      (srt_secd_round     ),
+  .srt_sm_on           (srt_sm_on          ),
+  .vfpu_yy_xx_dqnan    (vfpu_yy_xx_dqnan   ),
+  .vfpu_yy_xx_rm       (vfpu_yy_xx_rm      )
+);
+
+// &Instance("ct_vfdsu_scalar_dp"); @134
+ct_vfdsu_scalar_dp  x_ct_vfdsu_scalar_dp (  // only instance tied to the dp_vfdsu_ex1_pipex_{dst_ereg,dst_vreg,iid,imm0,srcf0,srcf1} inputs, idu_vfpu_rf_pipex_* and the pipex_dp_vfdsu_{ereg,ereg_data,freg_data,vreg} outputs
+  .cp0_vfpu_icg_en               (cp0_vfpu_icg_en              ),
+  .cp0_yy_clk_en                 (cp0_yy_clk_en                ),
+  .cpurst_b                      (cpurst_b                     ),
+  .dp_vfdsu_ex1_pipex_dst_ereg   (dp_vfdsu_ex1_pipex_dst_ereg  ),
+  .dp_vfdsu_ex1_pipex_dst_vreg   (dp_vfdsu_ex1_pipex_dst_vreg  ),
+  .dp_vfdsu_ex1_pipex_iid        (dp_vfdsu_ex1_pipex_iid       ),
+  .dp_vfdsu_ex1_pipex_imm0       (dp_vfdsu_ex1_pipex_imm0      ),
+  .dp_vfdsu_ex1_pipex_srcf0      (dp_vfdsu_ex1_pipex_srcf0     ),
+  .dp_vfdsu_ex1_pipex_srcf1      (dp_vfdsu_ex1_pipex_srcf1     ),
+  .ex1_data_clk                  (ex1_data_clk                 ),
+  .ex1_div                       (ex1_div                      ),
+  .ex1_double                    (ex1_double                   ),
+  .ex1_pipedown                  (ex1_pipedown                 ),
+  .ex1_scalar                    (ex1_scalar                   ),
+  .ex1_single                    (ex1_single                   ),
+  .ex1_sqrt                      (ex1_sqrt                     ),
+  .ex1_src0                      (ex1_src0                     ),
+  .ex1_src1                      (ex1_src1                     ),
+  .ex1_static_rm                 (ex1_static_rm                ),
+  .ex2_data_clk                  (ex2_data_clk                 ),
+  .ex2_pipedown                  (ex2_pipedown                 ),
+  .ex3_data_clk                  (ex3_data_clk                 ),
+  .ex3_pipedown                  (ex3_pipedown                 ),
+  .ex4_out_expt                  (ex4_out_expt                 ),
+  .ex4_out_result                (ex4_out_result               ),
+  .forever_cpuclk                (forever_cpuclk               ),
+  .idu_vfpu_rf_pipex_func        (idu_vfpu_rf_pipex_func       ),
+  .idu_vfpu_rf_pipex_gateclk_sel (idu_vfpu_rf_pipex_gateclk_sel),
+  .pad_yy_icg_scan_en            (pad_yy_icg_scan_en           ),
+  .pipex_dp_vfdsu_ereg           (pipex_dp_vfdsu_ereg          ),
+  .pipex_dp_vfdsu_ereg_data      (pipex_dp_vfdsu_ereg_data     ),
+  .pipex_dp_vfdsu_freg_data      (pipex_dp_vfdsu_freg_data     ),
+  .pipex_dp_vfdsu_vreg           (pipex_dp_vfdsu_vreg          ),
+  .vfdsu_ex2_double              (vfdsu_ex2_double             ),
+  .vfdsu_ex2_single              (vfdsu_ex2_single             )
+);
+
+
+// &ModuleEnd; @137
+endmodule
+
+
diff --git a/vendor/openc910/LICENSE b/vendor/openc910/LICENSE
new file mode 100644
index 00000000..261eeb9e
--- /dev/null
+++ b/vendor/openc910/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/vendor/openc910/README.md b/vendor/openc910/README.md
new file mode 100644
index 00000000..c4febe77
--- /dev/null
+++ b/vendor/openc910/README.md
@@ -0,0 +1,74 @@
+# IP Readme
+
+  Welcome to C910! Some key directories are shown below.
+```
+|--C910_RTL_FACTORY/
+  |--gen_rtl/     ## Verilog source code of C910
+  |--setup/       ## Script to set the environment variables
+|--smart_run/     ## RTL simulation environment
+  |--impl/        ## SDC file, scripts and file lists for implementation
+  |--logical/     ## SoC demo and test bench to run the simulation
+  |--setup/       ## GNU tool chain setting
+  |--tests/       ## Test driver and test cases
+  |--work/        ## Working directory for builds
+  |--Makefile     ## Makefile for building and running sim targets
+|--doc/           ## The user and integration manual of C910
+```
+
+
+## Usage
+
+  Step1: Get Started
+
+```
+$ cd C910_RTL_FACTORY
+$ source setup/setup.csh
+$ cd ../smart_run
+$ make help
+To gain more information about how to use smart testbench.
+```
+
+  Step2: Download and install C/C++ Compiler
+
+```
+You can download the GNU tool chain compiled by T-HEAD from the url below:
+https://occ.t-head.cn/community/download?id=3948120165480468480
+
+$ cd ./smart_run
+GNU tool chain (specific riscv version) must be installed and specified before
+compiling *.c/*.v tests of the smart environment. Please refer to the following
+setup file about how to specify it:
+    ./smart_run/setup/example_setup.csh
+```
+
+
+## Notes
+
+```
+The testbench supports Verilator (version 4.215 or newer is recommended), iverilog, vcs and irun for simulation, and you can use Gtkwave or verdi
+to open the waveforms under the ./smart_run/work/ directory.
+
+You can get the debugger, IDE and SDK from the URL: https://occ.t-head.cn/community/download?id=575997419775328256
+```
+
+
+## Discussion
+    If you are interested in participating in discussions or improving the "openXuantie" cores, you can scan the DingDing QR code below to join the discussion group.
+<img src="https://github.com/T-head-Semi/opene902/blob/main/doc/QR_code_openXuantie.png" />
+
+
+/*Copyright 2019-2021 T-Head Semiconductor Co., Ltd.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+*/

From 97bfe0095c4e0cb8b6fd117987f04b11ff56eb1b Mon Sep 17 00:00:00 2001
From: Luca Bertaccini <55843305+lucabertaccini@users.noreply.github.com>
Date: Fri, 7 Jun 2024 14:43:10 +0200
Subject: [PATCH 3/8] Fix synchronization of THMULTI DivSqrt lanes when
 FP16ALT, FP8, or FP8ALT are enabled (#9)

* Fix synchronization of THMULTI DivSqrt lanes when FP16ALT, FP8 or FP8ALT are enabled

* Update CHANGELOG-PULP.md
---
 docs/CHANGELOG-PULP.md              | 5 +++++
 src/fpnew_opgroup_multifmt_slice.sv | 6 +++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/docs/CHANGELOG-PULP.md b/docs/CHANGELOG-PULP.md
index cd09eda5..44e1432c 100644
--- a/docs/CHANGELOG-PULP.md
+++ b/docs/CHANGELOG-PULP.md
@@ -7,6 +7,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a
 In this sense, we interpret the "Public API" of a hardware module as its port/parameter list.
 Versions of the IP in the same major relase are "pin-compatible" with each other. Minor relases are permitted to add new parameters as long as their default bindings ensure backwards compatibility.
 
+## [pulp-v0.2.1] - 2024-06-07
+
+### Fix
+- Fix synchronization of THMULTI DivSqrt lanes when FP16ALT, FP8, or FP8ALT are enabled.
+
 ## [pulp-v0.2.0] - 2024-05-29
 
 ### Added
diff --git a/src/fpnew_opgroup_multifmt_slice.sv b/src/fpnew_opgroup_multifmt_slice.sv
index 6b5545c5..ff6f1a14 100644
--- a/src/fpnew_opgroup_multifmt_slice.sv
+++ b/src/fpnew_opgroup_multifmt_slice.sv
@@ -628,10 +628,10 @@ or on 16b inputs producing 32b outputs");
 
   if ((DivSqrtSel != fpnew_pkg::TH32) && (OpGroup == fpnew_pkg::DIVSQRT)) begin
     // Synch lanes if there is more than one
-    assign simd_synch_rdy  = EnableVectors ? &divsqrt_ready : divsqrt_ready[0];
-    assign simd_synch_done = EnableVectors ? &divsqrt_done  : divsqrt_done[0];
+    assign simd_synch_rdy  = EnableVectors ? &divsqrt_ready[NUM_DIVSQRT_LANES-1:0] : divsqrt_ready[0];
+    assign simd_synch_done = EnableVectors ? &divsqrt_done[NUM_DIVSQRT_LANES-1:0]  : divsqrt_done[0];
   end else begin
-    // Unused (alternative divider only supported for scalar FP32 divsqrt)
+    // Unused (TH32 divider only supported for scalar FP32 divsqrt)
     assign simd_synch_rdy  = '0;
     assign simd_synch_done = '0;
   end

From d30aecff395618cb0608ca8896e19425b77a3420 Mon Sep 17 00:00:00 2001
From: Luca Bertaccini <55843305+lucabertaccini@users.noreply.github.com>
Date: Wed, 26 Jun 2024 12:26:43 +0200
Subject: [PATCH 4/8] Add FP16ALT support to THMULTI DivSqrt (#12)

* Add FP16ALT support to THMULTI DivSqrt
---
 docs/CHANGELOG-PULP.md                        |    5 +
 docs/README.md                                |    2 +-
 src/fpnew_divsqrt_th_64_multi.sv              |   39 +-
 src/fpnew_opgroup_multifmt_slice.sv           |    4 +-
 src/fpnew_pkg.sv                              |    4 +-
 vendor/openc910.vendor.hjson                  |    2 +
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v         |   21 +-
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_double.v       |   29 +
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v         |   64 +-
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v      |   96 +-
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_round.v        |  151 +-
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v    |   24 +-
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v          |   86 +-
 .../gen_rtl/vfdsu/rtl/ct_vfdsu_top.v          |   16 +-
 ...6ALT-support-to-THMULTI-DivSqrt-unit.patch | 1359 +++++++++++++++++
 15 files changed, 1811 insertions(+), 91 deletions(-)
 create mode 100644 vendor/patches/openc910/0001-Add-FP16ALT-support-to-THMULTI-DivSqrt-unit.patch

diff --git a/docs/CHANGELOG-PULP.md b/docs/CHANGELOG-PULP.md
index 44e1432c..94d245be 100644
--- a/docs/CHANGELOG-PULP.md
+++ b/docs/CHANGELOG-PULP.md
@@ -7,6 +7,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a
 In this sense, we interpret the "Public API" of a hardware module as its port/parameter list.
 Versions of the IP in the same major relase are "pin-compatible" with each other. Minor relases are permitted to add new parameters as long as their default bindings ensure backwards compatibility.
 
+## [pulp-v0.2.2] - 2024-06-24
+
+### Added
+- Add FP16ALT support to THMULTI DivSqrt
+
 ## [pulp-v0.2.1] - 2024-06-07
 
 ### Fix
diff --git a/docs/README.md b/docs/README.md
index dd8a0e9b..f00fb3b5 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -366,7 +366,7 @@ It is of type `divsqrt_unit_t`, which is defined as:
 typedef enum logic[1:0] {
   PULP,    // "PULP" instantiates the PULP DivSqrt unit supports FP64, FP32, FP16, FP16ALT, FP8 and SIMD operations
   TH32,    // "TH32" instantiates the E906 DivSqrt unit supports only FP32 (no SIMD support)
-  THMULTI  // "THMULTI" instantiates the C910 DivSqrt unit supports FP64, FP32, FP16 and SIMD operations
+  THMULTI  // "THMULTI" instantiates the C910 DivSqrt unit supports FP64, FP32, FP16, FP16ALT and SIMD operations
 } divsqrt_unit_t;
 ```
 
diff --git a/src/fpnew_divsqrt_th_64_multi.sv b/src/fpnew_divsqrt_th_64_multi.sv
index eff0620d..a15878af 100644
--- a/src/fpnew_divsqrt_th_64_multi.sv
+++ b/src/fpnew_divsqrt_th_64_multi.sv
@@ -144,31 +144,34 @@ module fpnew_divsqrt_th_64_multi #(
   // -----------------
   // Input processing
   // -----------------
-  logic [1:0] divsqrt_fmt;
+  logic [3:0] divsqrt_fmt;
 
   // Translate fpnew formats into divsqrt formats
   if(WIDTH == 64) begin : translate_fmt_64_bits
     always_comb begin : translate_fmt
       unique case (dst_fmt_q)
-        fpnew_pkg::FP64:    divsqrt_fmt = 2'b10;
-        fpnew_pkg::FP32:    divsqrt_fmt = 2'b01;
-        fpnew_pkg::FP16:    divsqrt_fmt = 2'b00;
-        default:            divsqrt_fmt = 2'b10; // 64 bit max width
+        fpnew_pkg::FP64:    divsqrt_fmt = 4'b1000;
+        fpnew_pkg::FP32:    divsqrt_fmt = 4'b0100;
+        fpnew_pkg::FP16:    divsqrt_fmt = 4'b0010;
+        fpnew_pkg::FP16ALT: divsqrt_fmt = 4'b0001;
+        default:            divsqrt_fmt = 4'b1000; // 64 bit max width
       endcase
     end
   end else if(WIDTH == 32) begin : translate_fmt_32_bits
     always_comb begin : translate_fmt
       unique case (dst_fmt_q)
-        fpnew_pkg::FP32:    divsqrt_fmt = 2'b01;
-        fpnew_pkg::FP16:    divsqrt_fmt = 2'b00;
-        default:            divsqrt_fmt = 2'b01; // 32 bit max width
+        fpnew_pkg::FP32:    divsqrt_fmt = 4'b0100;
+        fpnew_pkg::FP16:    divsqrt_fmt = 4'b0010;
+        fpnew_pkg::FP16ALT: divsqrt_fmt = 4'b0001;
+        default:            divsqrt_fmt = 4'b0100; // 32 bit max width
       endcase
     end
   end else if(WIDTH == 16) begin : translate_fmt_16_bits
     always_comb begin : translate_fmt
       unique case (dst_fmt_q)
-        fpnew_pkg::FP16:    divsqrt_fmt = 2'b00;
-        default:            divsqrt_fmt = 2'b00; // 16 bit max width
+        fpnew_pkg::FP16:    divsqrt_fmt = 4'b0010;
+        fpnew_pkg::FP16ALT: divsqrt_fmt = 4'b0001;
+        default:            divsqrt_fmt = 4'b0010; // 16 bit max width
       endcase
     end
   end else begin
@@ -298,7 +301,7 @@ module fpnew_divsqrt_th_64_multi #(
   
   // Regs to save current instruction
   fpnew_pkg::roundmode_e rm_q;
-  logic[1:0] divsqrt_fmt_q;
+  logic[3:0] divsqrt_fmt_q;
   fpnew_pkg::operation_e divsqrt_op_q;
   logic div_op, sqrt_op;
   logic [WIDTH-1:0] srcf0_q, srcf1_q;
@@ -314,15 +317,15 @@ module fpnew_divsqrt_th_64_multi #(
   // NaN-box inputs with max WIDTH
   if(WIDTH == 64) begin : gen_fmt_64_bits
     always_comb begin : NaN_box_inputs
-      if(divsqrt_fmt_q == 2'b10) begin // 64-bit
+      if(divsqrt_fmt_q == 4'b1000) begin // 64-bit
         srcf0[63:0] = srcf0_q[63:0];
         srcf1[63:0] = srcf1_q[63:0];
-      end else if(divsqrt_fmt_q == 2'b01) begin // 32-bit
+      end else if(divsqrt_fmt_q == 4'b0100) begin // 32-bit
         srcf0[63:32] = '1;
         srcf1[63:32] = '1;
         srcf0[31:0] = srcf0_q[31:0];
         srcf1[31:0] = srcf1_q[31:0];
-      end else if(divsqrt_fmt_q == 2'b00) begin //16-bit
+      end else if((divsqrt_fmt_q == 4'b0010) || (divsqrt_fmt_q == 4'b0001)) begin //16-bit
         srcf0[63:16] = '1;
         srcf1[63:16] = '1;
         srcf0[15:0] = srcf0_q[15:0];
@@ -334,12 +337,12 @@ module fpnew_divsqrt_th_64_multi #(
     end
   end else if (WIDTH == 32) begin : gen_fmt_32_bits
     always_comb begin : NaN_box_inputs
-      if(divsqrt_fmt_q == 2'b01) begin // 32-bit
+      if(divsqrt_fmt_q == 4'b0100) begin // 32-bit
         srcf0[63:32] = '1;
         srcf1[63:32] = '1;
         srcf0[31:0] = srcf0_q[31:0];
         srcf1[31:0] = srcf1_q[31:0];
-      end else if(divsqrt_fmt_q == 2'b00) begin // 16-bit
+      end else if((divsqrt_fmt_q == 4'b0010) || (divsqrt_fmt_q == 4'b0001)) begin // 16-bit
         srcf0[63:16] = '1;
         srcf1[63:16] = '1;
         srcf0[15:0] = srcf0_q[15:0];
@@ -351,7 +354,7 @@ module fpnew_divsqrt_th_64_multi #(
     end
   end else if (WIDTH == 16) begin : gen_fmt_16_bits
     always_comb begin : NaN_box_inputs
-      if(divsqrt_fmt_q == 2'b00) begin // 16-bit
+      if((divsqrt_fmt_q == 4'b0010) || (divsqrt_fmt_q == 4'b0001)) begin // 16-bit
         srcf0[63:16] = '1;
         srcf1[63:16] = '1;
         srcf0[15:0] = srcf0_q[15:0];
@@ -390,7 +393,7 @@ module fpnew_divsqrt_th_64_multi #(
     .dp_vfdsu_fdiv_gateclk_issue    ( 1'b1                      ), // Local clock enable (same as above)
     .dp_vfdsu_idu_fdiv_issue        ( op_starting               ), // 1. Issue fdiv (FSM in ctrl)
     .forever_cpuclk                 ( clk_i                     ), // Clock input
-    .idu_vfpu_rf_pipex_func         ( {3'b0, divsqrt_fmt_q, 13'b0 ,sqrt_op, div_op} ), // Defines format (bits 16,15) and operation (bits 1,0)
+    .idu_vfpu_rf_pipex_func         ( {3'b0, divsqrt_fmt_q, 11'b0 ,sqrt_op, div_op} ), // Defines format (bits 16:13) and operation (bits 1,0)
     .idu_vfpu_rf_pipex_gateclk_sel  ( func_sel                  ), // 2. Select func
     .pad_yy_icg_scan_en             ( 1'b0                      ), // SE signal for the redundant clock gating module
     .rtu_yy_xx_flush                ( flush_i                   ), // Flush
diff --git a/src/fpnew_opgroup_multifmt_slice.sv b/src/fpnew_opgroup_multifmt_slice.sv
index ff6f1a14..f5991cbd 100644
--- a/src/fpnew_opgroup_multifmt_slice.sv
+++ b/src/fpnew_opgroup_multifmt_slice.sv
@@ -68,9 +68,9 @@ module fpnew_opgroup_multifmt_slice #(
     if ((DivSqrtSel == fpnew_pkg::TH32) && !((FpFmtConfig[0] == 1) && (FpFmtConfig[1:NUM_FORMATS-1] == '0))) begin
       $fatal(1, "T-Head-based DivSqrt unit supported only in FP32-only configurations. \
 Set DivSqrtSel = THMULTI or DivSqrtSel = PULP to use a multi-format divider");
-    end else if ((DivSqrtSel == fpnew_pkg::THMULTI) && (FpFmtConfig[3] == 1'b1 || FpFmtConfig[4] == 1'b1 || FpFmtConfig[5] == 1'b1)) begin
+    end else if ((DivSqrtSel == fpnew_pkg::THMULTI) && (FpFmtConfig[3] == 1'b1 || FpFmtConfig[5] == 1'b1)) begin
       $warning("The DivSqrt unit of C910 (instantiated by DivSqrtSel = THMULTI) does not support \
-FP16alt, FP8, FP8alt. Please use the PULP DivSqrt unit when in need of div/sqrt operations on FP16alt, FP8, FP8alt.");
+FP8, FP8alt. Please use the PULP DivSqrt unit when in need of div/sqrt operations on FP8, FP8alt.");
     end
   end
 
diff --git a/src/fpnew_pkg.sv b/src/fpnew_pkg.sv
index 42d0df6b..1e8ce099 100644
--- a/src/fpnew_pkg.sv
+++ b/src/fpnew_pkg.sv
@@ -136,7 +136,7 @@ package fpnew_pkg;
   typedef enum logic[1:0] {
     PULP,    // "PULP" instantiates the PULP DivSqrt unit supports FP64, FP32, FP16, FP16ALT, FP8 and SIMD operations
     TH32,    // "TH32" instantiates the E906 DivSqrt unit supports only FP32 (no SIMD support)
-    THMULTI  // "THMULTI" instantiates the C910 DivSqrt unit supports FP64, FP32, FP16 and SIMD operations
+    THMULTI  // "THMULTI" instantiates the C910 DivSqrt unit supports FP64, FP32, FP16, FP16ALT and SIMD operations
   } divsqrt_unit_t;
 
   // -------------------
@@ -454,7 +454,7 @@ package fpnew_pkg;
     // Returns the maximum number of lanes in the FPU according to width, format config and vectors
   function automatic int unsigned num_divsqrt_lanes(int unsigned width, fmt_logic_t cfg, logic vec, divsqrt_unit_t DivSqrtSel);
     automatic fmt_logic_t cfg_tmp;
-    cfg_tmp = (DivSqrtSel == THMULTI) ? cfg & 6'b111000 : cfg;
+    cfg_tmp = (DivSqrtSel == THMULTI) ? cfg & 6'b111010 : cfg;
     return vec ? width / min_fp_width(cfg_tmp) : 1; // if no vectors, only one lane
   endfunction
 
diff --git a/vendor/openc910.vendor.hjson b/vendor/openc910.vendor.hjson
index ddaa644f..356121b0 100644
--- a/vendor/openc910.vendor.hjson
+++ b/vendor/openc910.vendor.hjson
@@ -10,6 +10,8 @@
     rev: "e0c4ad8ec7f8c70f649d826ebd6c949086453272"
   }
 
+  patch_dir: "patches/openc910"
+
   exclude_from_upstream: [
     "doc",
     "smart_run",
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v
index f7f541f2..0aba4f1c 100644
--- a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v
@@ -26,6 +26,8 @@ module ct_vfdsu_ctrl(
   ex1_double,
   ex1_pipedown,
   ex1_single,
+  ex1_half,
+  ex1_bfloat,
   ex2_data_clk,
   ex2_pipedown,
   ex2_srt_first_round,
@@ -43,6 +45,8 @@ module ct_vfdsu_ctrl(
   vfdsu_dp_inst_wb_req,
   vfdsu_ex2_double,
   vfdsu_ex2_single,
+  vfdsu_ex2_half,
+  vfdsu_ex2_bfloat,
   vfdsu_ifu_debug_ex2_wait,
   vfdsu_ifu_debug_idle,
   vfdsu_ifu_debug_pipe_busy
@@ -57,6 +61,8 @@ input          dp_vfdsu_fdiv_gateclk_issue;
 input          dp_vfdsu_idu_fdiv_issue;    
 input          ex1_double;                 
 input          ex1_single;                 
+input          ex1_half;
+input          ex1_bfloat;
 input          forever_cpuclk;             
 input          pad_yy_icg_scan_en;         
 input          rtu_yy_xx_flush;            
@@ -64,6 +70,8 @@ input          srt_ctrl_rem_zero;
 input          srt_ctrl_skip_srt;          
 input          vfdsu_ex2_double;           
 input          vfdsu_ex2_single;           
+input          vfdsu_ex2_half;
+input          vfdsu_ex2_bfloat;
 output         ex1_data_clk;               
 output         ex1_pipedown;               
 output         ex2_data_clk;               
@@ -106,6 +114,8 @@ wire           ex1_data_clk_en;
 wire           ex1_double;                 
 wire           ex1_pipedown;               
 wire           ex1_single;                 
+wire           ex1_half;
+wire           ex1_bfloat;
 wire           ex2_data_clk;               
 wire           ex2_data_clk_en;            
 wire           ex2_pipe_clk;               
@@ -137,6 +147,8 @@ wire           vfdsu_dp_fdiv_busy;
 wire           vfdsu_dp_inst_wb_req;       
 wire           vfdsu_ex2_double;           
 wire           vfdsu_ex2_single;           
+wire           vfdsu_ex2_half;
+wire           vfdsu_ex2_bfloat;
 wire           vfdsu_ex2_vld;              
 wire           vfdsu_ifu_debug_ex2_wait;   
 wire           vfdsu_ifu_debug_idle;       
@@ -244,8 +256,9 @@ end
 //For Double, initial is 5'b11100('d28), calculate 29 round
 //For Single, initial is 5'b01110('d14), calculate 15 round
 assign srt_cnt_ini[4:0] = (ex1_double) ? 5'b01101 :
-                           ex1_single  ? 5'b00110
-                                       : 5'b00011;
+                          (ex1_single) ? 5'b00110 :
+                          (ex1_half)   ? 5'b00011
+                                       : 5'b00010;
 
 //vfdsu ex2 pipedown signal
 assign ex2_pipedown = srt_last_round && div_st_ex2;
@@ -277,7 +290,9 @@ assign srt_secd_round  = ex2_srt_secd_round;
 
 assign ex2_srt_secd_round_pre  = srt_sm_on && srt_secd_round_pre;
 assign srt_secd_round_pre      = vfdsu_ex2_double ? srt_cnt[4:0]==5'b01101 : 
-                                 vfdsu_ex2_single ? srt_cnt[4:0]==5'b00110 : srt_cnt[4:0] == 5'b00011;
+                                 vfdsu_ex2_single ? srt_cnt[4:0]==5'b00110 :
+                                 vfdsu_ex2_half   ? srt_cnt[4:0]==5'b00011
+                                                  : srt_cnt[4:0]==5'b00010;
 
 //==========================================================
 //              EX3 Stage Control Signal
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_double.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_double.v
index b57e289e..ccd34f9c 100644
--- a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_double.v
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_double.v
@@ -24,6 +24,8 @@ module ct_vfdsu_double(
   ex1_pipedown,
   ex1_scalar,
   ex1_single,
+  ex1_half,
+  ex1_bfloat,
   ex1_sqrt,
   ex1_src0,
   ex1_src1,
@@ -52,6 +54,8 @@ input           ex1_double;
 input           ex1_pipedown;                         
 input           ex1_scalar;                           
 input           ex1_single;                           
+input           ex1_half;
+input           ex1_bfloat;
 input           ex1_sqrt;                             
 input   [63:0]  ex1_src0;                             
 input   [63:0]  ex1_src1;                             
@@ -83,6 +87,8 @@ wire            ex1_pipedown;
 wire    [59:0]  ex1_remainder;                        
 wire            ex1_scalar;                           
 wire            ex1_single;                           
+wire            ex1_half;
+wire            ex1_bfloat;
 wire            ex1_sqrt;                             
 wire    [63:0]  ex1_src0;                             
 wire    [63:0]  ex1_src1;                             
@@ -116,12 +122,15 @@ wire            vfdsu_ex2_result_sign;
 wire            vfdsu_ex2_result_zero;                
 wire    [2 :0]  vfdsu_ex2_rm;                         
 wire            vfdsu_ex2_single;                     
+wire            vfdsu_ex2_half;
+wire            vfdsu_ex2_bfloat;
 wire            vfdsu_ex2_sqrt;                       
 wire            vfdsu_ex2_srt_skip;                   
 wire    [12:0]  vfdsu_ex3_doub_expnt_rst;             
 wire            vfdsu_ex3_double;                     
 wire            vfdsu_ex3_dz;                         
 wire    [12:0]  vfdsu_ex3_half_expnt_rst;             
+wire    [12:0]  vfdsu_ex3_bfloat_expnt_rst;
 wire            vfdsu_ex3_id_srt_skip;                
 wire            vfdsu_ex3_nv;                         
 wire            vfdsu_ex3_of;                         
@@ -141,6 +150,8 @@ wire    [2 :0]  vfdsu_ex3_rm;
 wire            vfdsu_ex3_rslt_denorm;                
 wire    [8 :0]  vfdsu_ex3_sing_expnt_rst;             
 wire            vfdsu_ex3_single;                     
+wire            vfdsu_ex3_half;
+wire            vfdsu_ex3_bfloat;
 wire            vfdsu_ex3_uf;                         
 wire            vfdsu_ex4_denorm_to_tiny_frac;        
 wire            vfdsu_ex4_double;                     
@@ -164,6 +175,8 @@ wire            vfdsu_ex4_result_sign;
 wire            vfdsu_ex4_result_zero;                
 wire            vfdsu_ex4_rslt_denorm;                
 wire            vfdsu_ex4_single;                     
+wire            vfdsu_ex4_half;
+wire            vfdsu_ex4_bfloat;
 wire            vfdsu_ex4_uf;                         
 wire            vfpu_yy_xx_dqnan;                     
 wire    [2 :0]  vfpu_yy_xx_rm;                        
@@ -181,6 +194,8 @@ ct_vfdsu_prepare  x_ct_vfdsu_prepare (
   .ex1_remainder         (ex1_remainder        ),
   .ex1_scalar            (ex1_scalar           ),
   .ex1_single            (ex1_single           ),
+  .ex1_half              (ex1_half             ),
+  .ex1_bfloat            (ex1_bfloat           ),
   .ex1_sqrt              (ex1_sqrt             ),
   .ex1_src0              (ex1_src0             ),
   .ex1_src1              (ex1_src1             ),
@@ -204,6 +219,8 @@ ct_vfdsu_prepare  x_ct_vfdsu_prepare (
   .vfdsu_ex2_result_zero (vfdsu_ex2_result_zero),
   .vfdsu_ex2_rm          (vfdsu_ex2_rm         ),
   .vfdsu_ex2_single      (vfdsu_ex2_single     ),
+  .vfdsu_ex2_half        (vfdsu_ex2_half       ),
+  .vfdsu_ex2_bfloat      (vfdsu_ex2_bfloat     ),
   .vfdsu_ex2_sqrt        (vfdsu_ex2_sqrt       ),
   .vfdsu_ex2_srt_skip    (vfdsu_ex2_srt_skip   ),
   .vfpu_yy_xx_dqnan      (vfpu_yy_xx_dqnan     ),
@@ -246,12 +263,15 @@ ct_vfdsu_srt  x_ct_vfdsu_srt (
   .vfdsu_ex2_result_zero                 (vfdsu_ex2_result_zero                ),
   .vfdsu_ex2_rm                          (vfdsu_ex2_rm                         ),
   .vfdsu_ex2_single                      (vfdsu_ex2_single                     ),
+  .vfdsu_ex2_half                        (vfdsu_ex2_half                       ),
+  .vfdsu_ex2_bfloat                      (vfdsu_ex2_bfloat                     ),
   .vfdsu_ex2_sqrt                        (vfdsu_ex2_sqrt                       ),
   .vfdsu_ex2_srt_skip                    (vfdsu_ex2_srt_skip                   ),
   .vfdsu_ex3_doub_expnt_rst              (vfdsu_ex3_doub_expnt_rst             ),
   .vfdsu_ex3_double                      (vfdsu_ex3_double                     ),
   .vfdsu_ex3_dz                          (vfdsu_ex3_dz                         ),
   .vfdsu_ex3_half_expnt_rst              (vfdsu_ex3_half_expnt_rst             ),
+  .vfdsu_ex3_bfloat_expnt_rst            (vfdsu_ex3_bfloat_expnt_rst           ),
   .vfdsu_ex3_id_srt_skip                 (vfdsu_ex3_id_srt_skip                ),
   .vfdsu_ex3_nv                          (vfdsu_ex3_nv                         ),
   .vfdsu_ex3_of                          (vfdsu_ex3_of                         ),
@@ -271,6 +291,8 @@ ct_vfdsu_srt  x_ct_vfdsu_srt (
   .vfdsu_ex3_rslt_denorm                 (vfdsu_ex3_rslt_denorm                ),
   .vfdsu_ex3_sing_expnt_rst              (vfdsu_ex3_sing_expnt_rst             ),
   .vfdsu_ex3_single                      (vfdsu_ex3_single                     ),
+  .vfdsu_ex3_half                        (vfdsu_ex3_half                       ),
+  .vfdsu_ex3_bfloat                      (vfdsu_ex3_bfloat                     ),
   .vfdsu_ex3_uf                          (vfdsu_ex3_uf                         )
 );
 
@@ -288,6 +310,7 @@ ct_vfdsu_round  x_ct_vfdsu_round (
   .vfdsu_ex3_double                      (vfdsu_ex3_double                     ),
   .vfdsu_ex3_dz                          (vfdsu_ex3_dz                         ),
   .vfdsu_ex3_half_expnt_rst              (vfdsu_ex3_half_expnt_rst             ),
+  .vfdsu_ex3_bfloat_expnt_rst            (vfdsu_ex3_bfloat_expnt_rst           ),
   .vfdsu_ex3_id_srt_skip                 (vfdsu_ex3_id_srt_skip                ),
   .vfdsu_ex3_nv                          (vfdsu_ex3_nv                         ),
   .vfdsu_ex3_of                          (vfdsu_ex3_of                         ),
@@ -307,6 +330,8 @@ ct_vfdsu_round  x_ct_vfdsu_round (
   .vfdsu_ex3_rslt_denorm                 (vfdsu_ex3_rslt_denorm                ),
   .vfdsu_ex3_sing_expnt_rst              (vfdsu_ex3_sing_expnt_rst             ),
   .vfdsu_ex3_single                      (vfdsu_ex3_single                     ),
+  .vfdsu_ex3_half                        (vfdsu_ex3_half                       ),
+  .vfdsu_ex3_bfloat                      (vfdsu_ex3_bfloat                     ),
   .vfdsu_ex3_uf                          (vfdsu_ex3_uf                         ),
   .vfdsu_ex4_denorm_to_tiny_frac         (vfdsu_ex4_denorm_to_tiny_frac        ),
   .vfdsu_ex4_double                      (vfdsu_ex4_double                     ),
@@ -330,6 +355,8 @@ ct_vfdsu_round  x_ct_vfdsu_round (
   .vfdsu_ex4_result_zero                 (vfdsu_ex4_result_zero                ),
   .vfdsu_ex4_rslt_denorm                 (vfdsu_ex4_rslt_denorm                ),
   .vfdsu_ex4_single                      (vfdsu_ex4_single                     ),
+  .vfdsu_ex4_half                        (vfdsu_ex4_half                       ),
+  .vfdsu_ex4_bfloat                      (vfdsu_ex4_bfloat                     ),
   .vfdsu_ex4_uf                          (vfdsu_ex4_uf                         )
 );
 
@@ -359,6 +386,8 @@ ct_vfdsu_pack  x_ct_vfdsu_pack (
   .vfdsu_ex4_result_zero         (vfdsu_ex4_result_zero        ),
   .vfdsu_ex4_rslt_denorm         (vfdsu_ex4_rslt_denorm        ),
   .vfdsu_ex4_single              (vfdsu_ex4_single             ),
+  .vfdsu_ex4_half                (vfdsu_ex4_half               ),
+  .vfdsu_ex4_bfloat              (vfdsu_ex4_bfloat             ),
   .vfdsu_ex4_uf                  (vfdsu_ex4_uf                 )
 );
 
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v
index e1d2e18a..681b77aa 100644
--- a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v
@@ -39,6 +39,8 @@ module ct_vfdsu_pack(
   vfdsu_ex4_result_zero,
   vfdsu_ex4_rslt_denorm,
   vfdsu_ex4_single,
+  vfdsu_ex4_half,
+  vfdsu_ex4_bfloat,
   vfdsu_ex4_uf
 );
 
@@ -65,6 +67,8 @@ input           vfdsu_ex4_result_sign;
 input           vfdsu_ex4_result_zero;        
 input           vfdsu_ex4_rslt_denorm;        
 input           vfdsu_ex4_single;             
+input           vfdsu_ex4_half;
+input           vfdsu_ex4_bfloat;
 input           vfdsu_ex4_uf;                 
 output  [4 :0]  ex4_out_expt;                 
 output  [63:0]  ex4_out_result;               
@@ -73,6 +77,7 @@ output  [63:0]  ex4_out_result;
 reg     [51:0]  ex4_denorm_frac;              
 reg     [51:0]  ex4_frac_52;                  
 reg     [51:0]  ex4_half_denorm_frac;         
+reg     [51:0]  ex4_bfloat_denorm_frac;
 reg     [63:0]  ex4_out_result;               
 reg     [51:0]  ex4_single_denorm_frac;       
 reg     [12:0]  expnt_add_op1;                
@@ -95,6 +100,11 @@ wire    [63:0]  ex4_half_rst0;
 wire    [63:0]  ex4_half_rst_inf;             
 wire    [63:0]  ex4_half_rst_norm;            
 wire    [63:0]  ex4_half_rst_qnan;            
+wire    [63:0]  ex4_bfloat_lfn;
+wire    [63:0]  ex4_bfloat_rst0;
+wire    [63:0]  ex4_bfloat_rst_inf;
+wire    [63:0]  ex4_bfloat_rst_norm;
+wire    [63:0]  ex4_bfloat_rst_qnan;
 wire            ex4_of_plus;                  
 wire    [4 :0]  ex4_out_expt;                 
 wire            ex4_result_inf;               
@@ -134,6 +144,8 @@ wire            vfdsu_ex4_result_sign;
 wire            vfdsu_ex4_result_zero;        
 wire            vfdsu_ex4_rslt_denorm;        
 wire            vfdsu_ex4_single;             
+wire            vfdsu_ex4_half;
+wire            vfdsu_ex4_bfloat;
 wire            vfdsu_ex4_uf;                 
 
 
@@ -277,6 +289,23 @@ endcase
 // &CombEnd; @147
 end
 
+always @( vfdsu_ex4_expnt_rst[12:0]
+       or ex4_frac[54:1]
+       or vfdsu_ex4_denorm_to_tiny_frac)
+begin
+case(vfdsu_ex4_expnt_rst[12:0])
+  13'h1:   ex4_bfloat_denorm_frac[51:0] = {      ex4_frac[52:1]}; //-1022 1
+  13'h0:   ex4_bfloat_denorm_frac[51:0] = {      ex4_frac[53:2]}; //-1023 0
+  13'h1fff:ex4_bfloat_denorm_frac[51:0] = {      ex4_frac[54:3]}; //-1024 -1
+  13'h1ffe:ex4_bfloat_denorm_frac[51:0] = {1'b0, ex4_frac[54:4]}; //-1025 -2
+  13'h1ffd:ex4_bfloat_denorm_frac[51:0] = {2'b0, ex4_frac[54:5]}; //-1026 -3
+  13'h1ffc:ex4_bfloat_denorm_frac[51:0] = {3'b0, ex4_frac[54:6]}; //-1027 -4
+  13'h1ffb:ex4_bfloat_denorm_frac[51:0] = {4'b0, ex4_frac[54:7]}; //-1028 -5
+  13'h1ffa:ex4_bfloat_denorm_frac[51:0] = {5'b0, ex4_frac[54:8]}; //-1029 -6
+  default :ex4_bfloat_denorm_frac[51:0] = vfdsu_ex4_denorm_to_tiny_frac ?{7'b1,45'b0} : 52'b0; //-1045
+endcase
+end
+
 //here when denormal number round to add1, it will become normal number
 assign ex4_denorm_potnt_norm    = (vfdsu_ex4_potnt_norm[1] && ex4_frac[53]) || 
                                   (vfdsu_ex4_potnt_norm[0] && ex4_frac[54]) ;
@@ -286,9 +315,11 @@ assign ex4_rslt_denorm          = !vfdsu_ex4_result_qnan
 assign ex4_denorm_result[63:0]  = vfdsu_ex4_double ? 
                                   {vfdsu_ex4_result_sign,11'h0,ex4_denorm_frac[51:0]} :
                                   vfdsu_ex4_single ? {32'hffffffff,vfdsu_ex4_result_sign,
-                                        8'h0,ex4_single_denorm_frac[51:29]}  : {
-                                        48'hffffffffffff,vfdsu_ex4_result_sign,5'h0,
-                                        ex4_half_denorm_frac[51:42]};
+                                        8'h0,ex4_single_denorm_frac[51:29]}  :
+                                  vfdsu_ex4_half ? {48'hffffffffffff,vfdsu_ex4_result_sign,5'h0,
+                                        ex4_half_denorm_frac[51:42]}
+                                                 : {48'hffffffffffff,vfdsu_ex4_result_sign,8'h0,
+                                        ex4_bfloat_denorm_frac[51:45]};
 
                                
 
@@ -299,6 +330,15 @@ assign ex4_half_rst_norm[63:0] = {48'hffffffffffff,vfdsu_ex4_result_sign,
                                   ex4_expnt_rst[4:0],
                                   ex4_frac_52[51:42]};
 assign ex4_half_rst0[63:0] = {48'hffffffffffff,vfdsu_ex4_result_sign,15'h0};                                
+
+assign ex4_bfloat_lfn[63:0]      = {48'hffffffffffff,vfdsu_ex4_result_sign,8'hfe,{7{1'b1}}};
+assign ex4_bfloat_rst_qnan[63:0] = {48'hffffffffffff,vfdsu_ex4_qnan_sign, 8'hff,1'b1, vfdsu_ex4_qnan_f[5:0]};
+assign ex4_bfloat_rst_inf[63:0]  = {48'hffffffffffff,vfdsu_ex4_result_sign,8'hff,7'b0};
+assign ex4_bfloat_rst_norm[63:0] = {48'hffffffffffff,vfdsu_ex4_result_sign,
+                                  ex4_expnt_rst[7:0],
+                                  ex4_frac_52[51:45]};
+assign ex4_bfloat_rst0[63:0] = {48'hffffffffffff,vfdsu_ex4_result_sign,15'h0};
+
 //ex4 overflow/underflow plus                                 
 assign ex4_rst_nor = vfdsu_ex4_result_nor;                    
 assign ex4_of_plus = vfdsu_ex4_potnt_of  && 
@@ -345,21 +385,23 @@ assign ex4_sing_rst_norm[63:0] = {32'hffffffff,vfdsu_ex4_result_sign,
                                   ex4_expnt_rst[7:0],
                                   ex4_frac_52[51:29]};
 assign ex4_rst_lfn[63:0]       = (vfdsu_ex4_double) ? ex4_doub_lfn[63:0] :
-                                  vfdsu_ex4_single  ? ex4_sing_lfn[63:0] : ex4_half_lfn[63:0];
+                                  vfdsu_ex4_single  ? ex4_sing_lfn[63:0] :
+                                  vfdsu_ex4_half    ? ex4_half_lfn[63:0] : ex4_bfloat_lfn[63:0];
 
 assign ex4_rst0[63:0]          = (vfdsu_ex4_double) ? ex4_doub_rst0[63:0] :
-                                  vfdsu_ex4_single  ? ex4_sing_rst0[63:0] : ex4_half_rst0[63:0];
+                                  vfdsu_ex4_single  ? ex4_sing_rst0[63:0] :
+                                  vfdsu_ex4_half    ? ex4_half_rst0[63:0] : ex4_bfloat_rst0[63:0];
 
 assign ex4_rst_qnan[63:0]      = (vfdsu_ex4_double) ? ex4_doub_rst_qnan[63:0] :
-                                  vfdsu_ex4_single  ? ex4_sing_rst_qnan[63:0] 
-                                                    : ex4_half_rst_qnan[63:0];
+                                  vfdsu_ex4_single  ? ex4_sing_rst_qnan[63:0] :
+                                  vfdsu_ex4_half    ? ex4_half_rst_qnan[63:0] : ex4_bfloat_rst_qnan[63:0];
 
 assign ex4_rst_norm[63:0]      = (vfdsu_ex4_double) ? ex4_doub_rst_norm[63:0] :
-                                  vfdsu_ex4_single  ? ex4_sing_rst_norm[63:0]
-                                                    : ex4_half_rst_norm[63:0];
+                                  vfdsu_ex4_single  ? ex4_sing_rst_norm[63:0] :
+                                  vfdsu_ex4_half    ? ex4_half_rst_norm[63:0] : ex4_bfloat_rst_norm[63:0];
 assign ex4_rst_inf[63:0]       = (vfdsu_ex4_double) ? ex4_doub_rst_inf[63:0] :
-                                  vfdsu_ex4_single  ? ex4_sing_rst_inf[63:0]
-                                                    : ex4_half_rst_inf[63:0];
+                                  vfdsu_ex4_single  ? ex4_sing_rst_inf[63:0] :
+                                  vfdsu_ex4_half    ? ex4_half_rst_inf[63:0] : ex4_bfloat_rst_inf[63:0];
 
       
 assign ex4_cor_uf            = (vfdsu_ex4_uf && !ex4_denorm_potnt_norm || ex4_uf_plus)
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v
index 7c5821c8..0ef958a3 100644
--- a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v
@@ -25,6 +25,8 @@ module ct_vfdsu_prepare(
   ex1_remainder,
   ex1_scalar,
   ex1_single,
+  ex1_half,
+  ex1_bfloat,
   ex1_sqrt,
   ex1_src0,
   ex1_src1,
@@ -48,6 +50,8 @@ module ct_vfdsu_prepare(
   vfdsu_ex2_result_zero,
   vfdsu_ex2_rm,
   vfdsu_ex2_single,
+  vfdsu_ex2_half,
+  vfdsu_ex2_bfloat,
   vfdsu_ex2_sqrt,
   vfdsu_ex2_srt_skip,
   vfpu_yy_xx_dqnan,
@@ -63,6 +67,8 @@ input           ex1_double;
 input           ex1_pipedown;             
 input           ex1_scalar;               
 input           ex1_single;               
+input           ex1_half;
+input           ex1_bfloat;
 input           ex1_sqrt;                 
 input   [63:0]  ex1_src0;                 
 input   [63:0]  ex1_src1;                 
@@ -90,6 +96,8 @@ output          vfdsu_ex2_result_sign;
 output          vfdsu_ex2_result_zero;    
 output  [2 :0]  vfdsu_ex2_rm;             
 output          vfdsu_ex2_single;         
+output          vfdsu_ex2_half;
+output          vfdsu_ex2_bfloat;
 output          vfdsu_ex2_sqrt;           
 output          vfdsu_ex2_srt_skip;       
 
@@ -115,6 +123,8 @@ reg             vfdsu_ex2_result_sign;
 reg             vfdsu_ex2_result_zero;    
 reg     [2 :0]  vfdsu_ex2_rm;             
 reg             vfdsu_ex2_single;         
+reg             vfdsu_ex2_half;
+reg             vfdsu_ex2_bfloat;
 reg             vfdsu_ex2_sqrt;           
 reg             vfdsu_ex2_srt_skip;       
 
@@ -161,6 +171,12 @@ wire            ex1_half_expnt1_max;
 wire            ex1_half_expnt1_zero;     
 wire            ex1_half_frac0_all0;      
 wire            ex1_half_frac1_all0;      
+wire            ex1_bfloat_expnt0_max;
+wire            ex1_bfloat_expnt1_max;
+wire            ex1_bfloat_expnt0_zero;
+wire            ex1_bfloat_expnt1_zero;
+wire            ex1_bfloat_frac0_all0;
+wire            ex1_bfloat_frac1_all0;
 wire            ex1_nv;                   
 wire            ex1_op0_cnan;             
 wire    [51:0]  ex1_op0_f;                
@@ -216,6 +232,8 @@ wire            ex1_sing_expnt1_zero;
 wire            ex1_sing_frac0_all0;      
 wire            ex1_sing_frac1_all0;      
 wire            ex1_single;               
+wire            ex1_half;
+wire            ex1_bfloat;
 wire            ex1_sqrt;                 
 wire            ex1_sqrt_expnt_odd;       
 wire            ex1_sqrt_expnt_result_odd; 
@@ -246,9 +264,11 @@ assign ex1_oper1[63:0]             = ex1_src1[63:0];
 
 //Sign bit prepare
 assign ex1_op0_sign                =  ex1_double ? ex1_oper0[63] :
-                                      ex1_single ? ex1_oper0[31] : ex1_oper0[15]; 
+                                      ex1_single ? ex1_oper0[31] :
+                                      ex1_half   ? ex1_oper0[15] : ex1_oper0[15]; // both arms pick bit 15: the non-half fallback (bfloat) uses the same sign position
 assign ex1_op1_sign                =  ex1_double ? ex1_oper1[63] :
-                                      ex1_single ? ex1_oper1[31] : ex1_oper1[15]; 
+                                      ex1_single ? ex1_oper1[31] :
+                                      ex1_half   ? ex1_oper1[15] : ex1_oper1[15]; // both arms pick bit 15: the non-half fallback (bfloat) uses the same sign position
 assign div_sign                    = ex1_op0_sign ^ ex1_op1_sign;
 assign sqrt_sign                   = ex1_op0_sign;
 assign ex1_result_sign             = (ex1_div)
@@ -261,10 +281,14 @@ assign ex1_doub_expnt1_max         = &ex1_oper1[62:52];
 assign ex1_sing_expnt1_max         = &ex1_oper1[30:23];
 assign ex1_half_expnt0_max         = &ex1_oper0[14:10];
 assign ex1_half_expnt1_max         = &ex1_oper1[14:10];
+assign ex1_bfloat_expnt0_max       = &ex1_oper0[14:7]; // operand 0: bfloat exponent field [14:7] is all ones
+assign ex1_bfloat_expnt1_max       = &ex1_oper1[14:7]; // operand 1: bfloat exponent field [14:7] is all ones
 assign ex1_expnt0_max              = ex1_double ? ex1_doub_expnt0_max :
-                                     ex1_single ? ex1_sing_expnt0_max : ex1_half_expnt0_max;
+                                     ex1_single ? ex1_sing_expnt0_max :
+                                     ex1_half   ? ex1_half_expnt0_max : ex1_bfloat_expnt0_max;
 assign ex1_expnt1_max              = ex1_double ? ex1_doub_expnt1_max :
-                                     ex1_single ? ex1_sing_expnt1_max : ex1_half_expnt1_max;
+                                     ex1_single ? ex1_sing_expnt1_max :
+                                     ex1_half   ? ex1_half_expnt1_max : ex1_bfloat_expnt1_max;
              
 //exponent zero
 assign ex1_doub_expnt0_zero        = ~|ex1_oper0[62:52];
@@ -273,10 +297,15 @@ assign ex1_doub_expnt1_zero        = ~|ex1_oper1[62:52];
 assign ex1_sing_expnt1_zero        = ~|ex1_oper1[30:23];
 assign ex1_half_expnt0_zero        = ~|ex1_oper0[14:10];
 assign ex1_half_expnt1_zero        = ~|ex1_oper1[14:10];
+assign ex1_bfloat_expnt0_zero      = ~|ex1_oper0[14:7]; // operand 0: bfloat exponent field [14:7] is all zeros
+assign ex1_bfloat_expnt1_zero      = ~|ex1_oper1[14:7]; // operand 1: bfloat exponent field [14:7] is all zeros
 assign ex1_expnt0_zero             = ex1_double ? ex1_doub_expnt0_zero :
-                                     ex1_single ? ex1_sing_expnt0_zero : ex1_half_expnt0_zero;
+                                     ex1_single ? ex1_sing_expnt0_zero :
+                                     ex1_half   ? ex1_half_expnt0_zero : ex1_bfloat_expnt0_zero;
 assign ex1_expnt1_zero             = ex1_double ? ex1_doub_expnt1_zero :
-                                     ex1_single ? ex1_sing_expnt1_zero : ex1_half_expnt1_zero; 
+                                     ex1_single ? ex1_sing_expnt1_zero :
+                                     ex1_half   ? ex1_half_expnt1_zero : ex1_bfloat_expnt1_zero;
+
 //fraction zero
 assign ex1_doub_frac0_all0         = ~|ex1_oper0[51:0];
 assign ex1_sing_frac0_all0         = ~|ex1_oper0[22:0];
@@ -284,14 +313,20 @@ assign ex1_doub_frac1_all0         = ~|ex1_oper1[51:0];
 assign ex1_sing_frac1_all0         = ~|ex1_oper1[22:0];
 assign ex1_half_frac0_all0         = ~|ex1_oper0[9:0];
 assign ex1_half_frac1_all0         = ~|ex1_oper1[9:0];
+assign ex1_bfloat_frac0_all0       = ~|ex1_oper0[6:0]; // operand 0: bfloat fraction field [6:0] is all zeros
+assign ex1_bfloat_frac1_all0       = ~|ex1_oper1[6:0]; // operand 1: bfloat fraction field [6:0] is all zeros
 assign ex1_frac0_all0              = ex1_double ? ex1_doub_frac0_all0 :
-                                     ex1_single ? ex1_sing_frac0_all0 : ex1_half_frac0_all0;   
+                                     ex1_single ? ex1_sing_frac0_all0 :
+                                     ex1_half ?   ex1_half_frac0_all0 : ex1_bfloat_frac0_all0;
 assign ex1_frac1_all0              = ex1_double ? ex1_doub_frac1_all0 :
-                                     ex1_single ? ex1_sing_frac1_all0 : ex1_half_frac1_all0;   
+                                     ex1_single ? ex1_sing_frac1_all0 :
+                                     ex1_half ?   ex1_half_frac1_all0 : ex1_bfloat_frac1_all0;
 assign ex1_frac0_msb               = ex1_double ? ex1_oper0[51] :
-                                     ex1_single ? ex1_oper0[22] : ex1_oper0[9];
+                                     ex1_single ? ex1_oper0[22] :
+                                     ex1_half   ? ex1_oper0[9]  : ex1_oper0[6];
 assign ex1_frac1_msb               = ex1_double ? ex1_oper1[51] :
-                                     ex1_single ? ex1_oper1[22] : ex1_oper1[9]; 
+                                     ex1_single ? ex1_oper1[22] :
+                                     ex1_half   ? ex1_oper1[9]  : ex1_oper1[6];
 assign ex1_oper0_high_all1         = ex1_single ? &ex1_oper0[63:32] : &ex1_oper0[63:16]; 
 assign ex1_oper1_high_all1         = ex1_single ? &ex1_oper1[63:32] : &ex1_oper1[63:16];
  
@@ -382,25 +417,30 @@ ct_vfdsu_ff1  x_frac1_expnt (
 // &Connect(.frac_bin_val(ex1_oper1_id_expnt[12:0])); @157
 // &Connect(.fanc_shift_num(ex1_oper1_id_frac[51:0])); @158
 assign ex1_oper0_frac[51:0] = ex1_double ? ex1_oper0[51:0] :
-                                           ex1_single ? {ex1_oper0[22:0],29'b0}
-                                                      : {ex1_oper0[9:0],42'b0};
+                                           ex1_single ? {ex1_oper0[22:0],29'b0} :
+                                           ex1_half   ? {ex1_oper0[9:0],42'b0}
+                                                      : {ex1_oper0[6:0],45'b0};
 assign ex1_oper1_frac[51:0] = ex1_double ? ex1_oper1[51:0] :
-                                           ex1_single ? {ex1_oper1[22:0],29'b0}
-                                                      : {ex1_oper1[9:0],42'b0};
+                                           ex1_single ? {ex1_oper1[22:0],29'b0} :
+                                           ex1_half   ? {ex1_oper1[9:0],42'b0}
+                                                      : {ex1_oper1[6:0],45'b0};
 //=====================exponent add=========================
 //exponent number 0
 assign ex1_div_op0_expnt[12:0]     = ex1_double ? {2'b0,ex1_oper0[62:52]} : 
-                                                  ex1_single ? {5'b0,ex1_oper0[30:23]}
-                                                             : {8'b0,ex1_oper0[14:10]};
+                                                  ex1_single ? {5'b0,ex1_oper0[30:23]} :
+                                                  ex1_half   ? {8'b0,ex1_oper0[14:10]}
+                                                             : {5'b0,ex1_oper0[14:7]};
 assign ex1_expnt_adder_op0[12:0]   = ex1_op0_id_nor ? ex1_oper0_id_expnt[12:0]
                                                     : ex1_div_op0_expnt[12:0];
 //exponent number 1
 assign ex1_div_op1_expnt[12:0]  = ex1_double ? {2'b0,ex1_oper1[62:52]} :
-                                               ex1_single ? {5'b0,ex1_oper1[30:23]}
-                                                          : {8'b0,ex1_oper1[14:10]};
+                                               ex1_single ? {5'b0,ex1_oper1[30:23]} :
+                                               ex1_half   ? {8'b0,ex1_oper1[14:10]}
+                                                          : {5'b0,ex1_oper1[14:7]};
 assign ex1_sqrt_op1_expnt[12:0] = ex1_double ? {3'b0,{10{1'b1}}} : //'d1023
-                                               ex1_single ? {6'b0,{7{1'b1}}} //'d127
-                                                          : {9'b0,{4{1'b1}}}; //'d15
+                                               ex1_single ? {6'b0,{7{1'b1}}} ://'d127
+                                               ex1_half   ? {9'b0,{4{1'b1}}}  //'d15
+                                                          : {6'b0,{7{1'b1}}}; //'d127
   
 // &CombBeg;  @180
 always @( ex1_oper1_id_expnt[12:0]
@@ -569,11 +609,13 @@ assign ex1_div_srt_op0[52:0]     = ex1_div_nor_srt_op0[52:0];
 assign ex1_div_srt_op1[52:0]     =  ex1_div_nor_srt_op1[52:0];
 //ex1_div_nor_srt_op0
 assign ex1_div_noid_nor_srt_op0[52:0] = ex1_double ? {1'b1,ex1_oper0[51:0]} :
-                                                     ex1_single ? {1'b1,ex1_oper0[22:0],29'b0}
-                                                                : {1'b1,ex1_oper0[9:0],42'b0};
+                                                     ex1_single ? {1'b1,ex1_oper0[22:0],29'b0} :
+                                                     ex1_half   ? {1'b1,ex1_oper0[9:0],42'b0}
+                                                                : {1'b1,ex1_oper0[6:0],45'b0};
 assign ex1_div_noid_nor_srt_op1[52:0] = ex1_double ? {1'b1,ex1_oper1[51:0]} :
-                                                     ex1_single ? {1'b1,ex1_oper1[22:0],29'b0}
-                                                                : {1'b1,ex1_oper1[9:0],42'b0};
+                                                     ex1_single ? {1'b1,ex1_oper1[22:0],29'b0} :
+                                                     ex1_half   ? {1'b1,ex1_oper1[9:0],42'b0}
+                                                                : {1'b1,ex1_oper1[6:0],45'b0};
 assign ex1_div_nor_srt_op0[52:0] = ex1_op0_id_nor ? {ex1_oper0_id_frac[51:0],1'b0} 
                                                   : ex1_div_noid_nor_srt_op0[52:0];
 //ex1_div_nor_srt_op1
@@ -699,6 +741,8 @@ begin
     vfdsu_ex2_sqrt            <=  1'b0;
     vfdsu_ex2_double          <=  1'b0;
     vfdsu_ex2_single          <=  1'b0;
+    vfdsu_ex2_half            <=  1'b0;
+    vfdsu_ex2_bfloat          <=  1'b0;
   end
   else if(ex1_pipedown)
   begin
@@ -721,6 +765,8 @@ begin
     vfdsu_ex2_sqrt            <= ex1_sqrt;
     vfdsu_ex2_double          <= ex1_double;
     vfdsu_ex2_single          <= ex1_single;
+    vfdsu_ex2_half            <= ex1_half;
+    vfdsu_ex2_bfloat          <= ex1_bfloat;
   end
   else
   begin
@@ -743,6 +789,8 @@ begin
     vfdsu_ex2_sqrt            <= vfdsu_ex2_sqrt;
     vfdsu_ex2_double          <= vfdsu_ex2_double;
     vfdsu_ex2_single          <= vfdsu_ex2_single;
+    vfdsu_ex2_half            <= vfdsu_ex2_half;
+    vfdsu_ex2_bfloat          <= vfdsu_ex2_bfloat;
   end
 end
 
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v
index 6eece526..cb3dc8e3 100644
--- a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v
@@ -27,6 +27,7 @@ module ct_vfdsu_round(
   vfdsu_ex3_double,
   vfdsu_ex3_dz,
   vfdsu_ex3_half_expnt_rst,
+  vfdsu_ex3_bfloat_expnt_rst,
   vfdsu_ex3_id_srt_skip,
   vfdsu_ex3_nv,
   vfdsu_ex3_of,
@@ -46,6 +47,8 @@ module ct_vfdsu_round(
   vfdsu_ex3_rslt_denorm,
   vfdsu_ex3_sing_expnt_rst,
   vfdsu_ex3_single,
+  vfdsu_ex3_half,
+  vfdsu_ex3_bfloat,
   vfdsu_ex3_uf,
   vfdsu_ex4_denorm_to_tiny_frac,
   vfdsu_ex4_double,
@@ -69,6 +72,8 @@ module ct_vfdsu_round(
   vfdsu_ex4_result_zero,
   vfdsu_ex4_rslt_denorm,
   vfdsu_ex4_single,
+  vfdsu_ex4_half,
+  vfdsu_ex4_bfloat,
   vfdsu_ex4_uf
 );
 
@@ -85,6 +90,7 @@ input   [12:0]  vfdsu_ex3_doub_expnt_rst;
 input           vfdsu_ex3_double;                     
 input           vfdsu_ex3_dz;                         
 input   [12:0]  vfdsu_ex3_half_expnt_rst;             
+input   [12:0]  vfdsu_ex3_bfloat_expnt_rst;
 input           vfdsu_ex3_id_srt_skip;                
 input           vfdsu_ex3_nv;                         
 input           vfdsu_ex3_of;                         
@@ -104,6 +110,8 @@ input   [2 :0]  vfdsu_ex3_rm;
 input           vfdsu_ex3_rslt_denorm;                
 input   [8 :0]  vfdsu_ex3_sing_expnt_rst;             
 input           vfdsu_ex3_single;                     
+input           vfdsu_ex3_half;
+input           vfdsu_ex3_bfloat;
 input           vfdsu_ex3_uf;                         
 output          vfdsu_ex4_denorm_to_tiny_frac;        
 output          vfdsu_ex4_double;                     
@@ -127,6 +135,8 @@ output          vfdsu_ex4_result_sign;
 output          vfdsu_ex4_result_zero;                
 output          vfdsu_ex4_rslt_denorm;                
 output          vfdsu_ex4_single;                     
+output          vfdsu_ex4_half;
+output          vfdsu_ex4_bfloat;
 output          vfdsu_ex4_uf;                         
 
 // &Regs; @24
@@ -138,8 +148,10 @@ reg             frac_orig;
 reg     [54:0]  frac_sub1_op1;                        
 reg             frac_sub_1;                           
 reg             half_denorm_lst_frac;                 
+reg             bfloat_denorm_lst_frac;
 reg     [56:0]  qt_result_double_denorm_for_round;    
 reg     [13:0]  qt_result_half_denorm_for_round;      
+reg     [10:0]  qt_result_bfloat_denorm_for_round;
 reg     [27:0]  qt_result_single_denorm_for_round;    
 reg             single_denorm_lst_frac;               
 reg             vfdsu_ex4_denorm_to_tiny_frac;        
@@ -164,6 +176,8 @@ reg             vfdsu_ex4_result_sign;
 reg             vfdsu_ex4_result_zero;                
 reg             vfdsu_ex4_rslt_denorm;                
 reg             vfdsu_ex4_single;                     
+reg             vfdsu_ex4_half;
+reg             vfdsu_ex4_bfloat;
 reg             vfdsu_ex4_uf;                         
 
 // &Wires; @25
@@ -199,6 +213,16 @@ wire            ex3_half_gr;
 wire            ex3_half_low_not_zero;                
 wire            ex3_half_rst_eq_1;                    
 wire            ex3_half_zero;                        
+wire            ex3_bfloat_denorm_eq;
+wire            ex3_bfloat_denorm_gr;
+wire            ex3_bfloat_denorm_plus;
+wire            ex3_bfloat_denorm_potnt_norm;
+wire            ex3_bfloat_denorm_zero;
+wire            ex3_bfloat_eq;
+wire            ex3_bfloat_gr;
+wire            ex3_bfloat_low_not_zero;
+wire            ex3_bfloat_rst_eq_1;
+wire            ex3_bfloat_zero;
 wire            ex3_nx;                               
 wire            ex3_pipe_clk;                         
 wire            ex3_pipe_clk_en;                      
@@ -210,6 +234,8 @@ wire            ex3_qt_eq;
 wire            ex3_qt_gr;                            
 wire            ex3_qt_half_lo2_not0;                 
 wire            ex3_qt_half_lo3_not0;                 
+wire            ex3_qt_bfloat_lo2_not0;
+wire            ex3_qt_bfloat_lo3_not0;
 wire            ex3_qt_sing_lo3_not0;                 
 wire            ex3_qt_sing_lo4_not0;                 
 wire            ex3_qt_zero;                          
@@ -254,6 +280,7 @@ wire            vfdsu_ex3_double;
 wire            vfdsu_ex3_dz;                         
 wire    [12:0]  vfdsu_ex3_expnt_rst;                  
 wire    [12:0]  vfdsu_ex3_half_expnt_rst;             
+wire    [12:0]  vfdsu_ex3_bfloat_expnt_rst;
 wire            vfdsu_ex3_id_srt_skip;                
 wire            vfdsu_ex3_nv;                         
 wire            vfdsu_ex3_of;                         
@@ -273,6 +300,8 @@ wire    [2 :0]  vfdsu_ex3_rm;
 wire            vfdsu_ex3_rslt_denorm;                
 wire    [8 :0]  vfdsu_ex3_sing_expnt_rst;             
 wire            vfdsu_ex3_single;                     
+wire            vfdsu_ex3_half;
+wire            vfdsu_ex3_bfloat;
 wire            vfdsu_ex3_uf;                         
 
 
@@ -302,6 +331,22 @@ assign ex3_half_zero        = (total_qt_rt_58[56])
 assign ex3_half_rst_eq_1    = total_qt_rt_58[56] && ~|total_qt_rt_58[55:46];       
 assign ex3_half_denorm_plus = !total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1ff2);
 assign ex3_half_denorm_potnt_norm = total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1ff1);
+
+assign ex3_qt_bfloat_lo3_not0 = |total_qt_rt_58[47:45]; // any of the 3 bits below bit 48 set (used when bit 56 is 1)
+assign ex3_qt_bfloat_lo2_not0 = |total_qt_rt_58[46:45]; // any of the 2 bits below bit 47 set (used when bit 56 is 0)
+assign ex3_bfloat_gr       = total_qt_rt_58[56]          // gr: bit 48/47 set AND some lower bfloat bit set
+                              ? total_qt_rt_58[48] && ex3_qt_bfloat_lo3_not0
+                              : total_qt_rt_58[47] && ex3_qt_bfloat_lo2_not0;
+assign ex3_bfloat_eq          = (total_qt_rt_58[56])     // eq: bit 48/47 set AND all lower bfloat bits clear
+                            ?  total_qt_rt_58[48] && !ex3_qt_bfloat_lo3_not0 // was ex3_qt_sing_lo4_not0 (single-precision bits)
+                            :  total_qt_rt_58[47] && !ex3_qt_bfloat_lo2_not0; // was ex3_qt_sing_lo3_not0 (single-precision bits)
+assign ex3_bfloat_zero        = (total_qt_rt_58[56])     // zero: bit 48/47 and all lower bfloat bits clear
+                            ? ~|total_qt_rt_58[48:45]
+                            : ~|total_qt_rt_58[47:45];
+assign ex3_bfloat_rst_eq_1    = total_qt_rt_58[56] && ~|total_qt_rt_58[55:49]; // bit 56 set and bits [55:49] all zero
+assign ex3_bfloat_denorm_plus = !total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1f82); // 13'h1f82 is -126 in 13-bit two's complement
+assign ex3_bfloat_denorm_potnt_norm = total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1f81); // 13'h1f81 is -127 in 13-bit two's complement
+
 assign vfdsu_ex3_expnt_rst[12:0]  = vfdsu_ex3_half_expnt_rst[12:0];
 // &Force("bus","total_qt_rt_58",57,0); @54
 assign ex3_qt_doub_lo3_not0 = |total_qt_rt_58[2:0]; 
@@ -343,19 +388,24 @@ assign ex3_doub_denorm_potnt_norm = total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[1
 assign ex3_sing_denorm_potnt_norm = total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1f81);
 assign ex3_rslt_denorm            = ex3_denorm_plus || vfdsu_ex3_rslt_denorm;
 assign ex3_denorm_potnt_norm      = vfdsu_ex3_double ? ex3_doub_denorm_potnt_norm :
-                                                       vfdsu_ex3_single ? ex3_sing_denorm_potnt_norm
-                                                                        : ex3_half_denorm_potnt_norm;
+                                                       vfdsu_ex3_single ? ex3_sing_denorm_potnt_norm :
+                                                       vfdsu_ex3_half   ? ex3_half_denorm_potnt_norm
+                                                                        : ex3_bfloat_denorm_potnt_norm;
 assign ex3_rst_eq_1         = (vfdsu_ex3_double)? ex3_doub_rst_eq_1 :
-                               vfdsu_ex3_single ? ex3_sing_rst_eq_1 : ex3_half_rst_eq_1;
+                               vfdsu_ex3_single ? ex3_sing_rst_eq_1 :
+                               vfdsu_ex3_half   ? ex3_half_rst_eq_1 : ex3_bfloat_rst_eq_1;
 assign ex3_qt_eq            = (vfdsu_ex3_double)? ex3_doub_eq :
-                               vfdsu_ex3_single ? ex3_sing_eq : ex3_half_eq;
+                               vfdsu_ex3_single ? ex3_sing_eq :
+                               vfdsu_ex3_half   ? ex3_half_eq : ex3_bfloat_eq;
 assign ex3_qt_gr            = (vfdsu_ex3_double)? ex3_doub_gr :
-                               vfdsu_ex3_single ? ex3_sing_gr : ex3_half_gr;
+                               vfdsu_ex3_single ? ex3_sing_gr :
+                               vfdsu_ex3_half   ? ex3_half_gr : ex3_bfloat_gr;
 assign ex3_qt_zero          = (vfdsu_ex3_double)? ex3_doub_zero :
-                               vfdsu_ex3_single ? ex3_sing_zero : ex3_half_zero;
+                               vfdsu_ex3_single ? ex3_sing_zero :
+                               vfdsu_ex3_half   ? ex3_half_zero : ex3_bfloat_zero;
 assign ex3_denorm_plus            = (vfdsu_ex3_double)  ? ex3_doub_denorm_plus 
                                     : vfdsu_ex3_single ? ex3_sing_denorm_plus
-                                                       : ex3_half_denorm_plus;
+                                    : vfdsu_ex3_half  ? ex3_half_denorm_plus : ex3_bfloat_denorm_plus;
                              
 // &CombBeg; @108
 always @( vfdsu_ex3_doub_expnt_rst[12:0]
@@ -682,14 +732,63 @@ assign ex3_half_denorm_gr      = qt_result_half_denorm_for_round[13]
 assign ex3_half_denorm_zero    = !qt_result_half_denorm_for_round[13] 
                                    && !ex3_half_low_not_zero;
 
+always @( vfdsu_ex3_bfloat_expnt_rst[8:0]  // bfloat denormal rounding: per result exponent, pick the
+       or total_qt_rt_58[56:45])           // quotient bits fed to the eq/gr/zero checks and the last kept bit
+begin                                      // case labels read as 9-bit two's complement; 2nd number = label + 127
+case(vfdsu_ex3_bfloat_expnt_rst[8:0])
+  9'h182:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[48:45],7'b0}; //-126 1
+                bfloat_denorm_lst_frac =  total_qt_rt_58[49];
+          end
+  9'h181:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[49:45],6'b0}; //-127 0
+                bfloat_denorm_lst_frac =  total_qt_rt_58[50];
+          end
+  9'h180:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[50:45],5'b0}; //-128 -1
+                bfloat_denorm_lst_frac =  total_qt_rt_58[51];
+          end
+  9'h17f:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[51:45],4'b0}; //-129 -2
+                bfloat_denorm_lst_frac =  total_qt_rt_58[52];
+          end
+  9'h17e:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[52:45],3'b0}; //-130 -3
+                bfloat_denorm_lst_frac =  total_qt_rt_58[53];
+          end
+  9'h17d:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[53:45],2'b0}; //-131 -4
+                bfloat_denorm_lst_frac =  total_qt_rt_58[54];
+          end
+  9'h17c:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[54:45],1'b0}; //-132 -5
+                bfloat_denorm_lst_frac =  total_qt_rt_58[55];
+          end
+  9'h17b:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[55:45]}; //-133 -6
+                bfloat_denorm_lst_frac =  total_qt_rt_58[56];
+          end
+  9'h17a:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[56:46]}; //-134 -7
+                bfloat_denorm_lst_frac =  1'b0; // no quotient bit above bit 56 is kept
+          end
+  default:  begin qt_result_bfloat_denorm_for_round[10:0] = 11'b0; // sized literal: '0 is SystemVerilog-only
+                 bfloat_denorm_lst_frac = 1'b0;
+            end // any other exponent: all rounding bits zero
+endcase
+end
+//rounding evaluation for bfloat denormalize number
+assign ex3_bfloat_denorm_eq      = qt_result_bfloat_denorm_for_round[10]
+                                   &&  !ex3_bfloat_low_not_zero;
+assign ex3_bfloat_low_not_zero   = |qt_result_bfloat_denorm_for_round[9:0];
+assign ex3_bfloat_denorm_gr      = qt_result_bfloat_denorm_for_round[10]
+                                   &&  ex3_bfloat_low_not_zero;
+assign ex3_bfloat_denorm_zero    = !qt_result_bfloat_denorm_for_round[10]
+                                   && !ex3_bfloat_low_not_zero;
+
 assign ex3_denorm_eq             = vfdsu_ex3_double ? ex3_double_denorm_eq :
-                                   vfdsu_ex3_single ? ex3_single_denorm_eq : ex3_half_denorm_eq;
+                                   vfdsu_ex3_single ? ex3_single_denorm_eq :
+                                   vfdsu_ex3_half   ? ex3_half_denorm_eq   : ex3_bfloat_denorm_eq;
 assign ex3_denorm_gr             = vfdsu_ex3_double ? ex3_double_denorm_gr :
-                                   vfdsu_ex3_single ? ex3_single_denorm_gr : ex3_half_denorm_gr;
+                                   vfdsu_ex3_single ? ex3_single_denorm_gr :
+                                   vfdsu_ex3_half   ? ex3_half_denorm_gr   : ex3_bfloat_denorm_gr;
 assign ex3_denorm_zero           = vfdsu_ex3_double ? ex3_double_denorm_zero :
-                                   vfdsu_ex3_single ? ex3_single_denorm_zero : ex3_half_denorm_zero;
+                                   vfdsu_ex3_single ? ex3_single_denorm_zero :
+                                   vfdsu_ex3_half   ? ex3_half_denorm_zero   : ex3_bfloat_denorm_zero;
 assign ex3_denorm_lst_frac       = vfdsu_ex3_double ? double_denorm_lst_frac :
-                                   vfdsu_ex3_single ? single_denorm_lst_frac : half_denorm_lst_frac;
+                                   vfdsu_ex3_single ? single_denorm_lst_frac :
+                                   vfdsu_ex3_half   ? half_denorm_lst_frac   : bfloat_denorm_lst_frac;
   
 //Different Round Mode with different rounding rule
 //Here we call rounding bit as "rb", remainder as "rem"
@@ -824,7 +923,9 @@ end
 // &CombBeg; @540
 always @( total_qt_rt_58[56]
        or vfdsu_ex3_single
-       or vfdsu_ex3_double)
+       or vfdsu_ex3_double
+       or vfdsu_ex3_half
+       or vfdsu_ex3_bfloat)
 begin
 case({total_qt_rt_58[56],vfdsu_ex3_double,vfdsu_ex3_single})
   3'b001: 
@@ -849,13 +950,23 @@ case({total_qt_rt_58[56],vfdsu_ex3_double,vfdsu_ex3_single})
   end
   3'b100:
   begin
-    frac_add1_op1[54:0] = {12'b1,43'b0};
-    frac_sub1_op1[54:0] = {{12{1'b1}},43'b0};
+    if (vfdsu_ex3_half) begin
+      frac_add1_op1[54:0] = {12'b1,43'b0};
+      frac_sub1_op1[54:0] = {{12{1'b1}},43'b0};
+    end else begin
+      frac_add1_op1[54:0] = {9'b1,46'b0};
+      frac_sub1_op1[54:0] = {{9{1'b1}},46'b0};
+    end
   end
   3'b000:
   begin
-    frac_add1_op1[54:0] = {13'b1,42'b0};
-    frac_sub1_op1[54:0] = {{13{1'b1}},42'b0};
+    if (vfdsu_ex3_half) begin
+      frac_add1_op1[54:0] = {13'b1,42'b0};
+      frac_sub1_op1[54:0] = {{13{1'b1}},42'b0};
+    end else begin
+      frac_add1_op1[54:0] = {10'b1,45'b0};
+      frac_sub1_op1[54:0] = {{10{1'b1}},45'b0};
+    end
   end
   default:
   begin
@@ -898,7 +1009,7 @@ assign ex3_nx      = ex3_rst_nor &&
 assign ex3_denorm_nx = ex3_rslt_denorm && (!ex3_denorm_zero ||  !vfdsu_ex3_rem_zero);
 //Adjust expnt
 //Div:Actural expnt should plus 1 when op0 is id, sub 1 when op1 id
-assign ex3_expnt_adjst[12:0] = vfdsu_ex3_double ? 13'h3ff: vfdsu_ex3_single ? 13'h7f : 13'hf;
+assign ex3_expnt_adjst[12:0] = vfdsu_ex3_double ? 13'h3ff: vfdsu_ex3_single ? 13'h7f : vfdsu_ex3_half ? 13'hf : 13'h7f;
 assign ex3_expnt_adjust_result[12:0] = vfdsu_ex3_expnt_rst[12:0] + 
                                        ex3_expnt_adjst[12:0];
 //this information is for the packing, which determin the result is normal
@@ -954,6 +1065,8 @@ begin
     vfdsu_ex4_potnt_norm[1:0] <= 2'b0;
     vfdsu_ex4_double          <= 1'b0;
     vfdsu_ex4_single          <= 1'b0;
+    vfdsu_ex4_half            <= 1'b0;
+    vfdsu_ex4_bfloat          <= 1'b0;
 
   end
   else if(ex3_pipedown)
@@ -982,6 +1095,8 @@ begin
     vfdsu_ex4_potnt_norm[1:0] <= ex3_potnt_norm[1:0];
     vfdsu_ex4_double          <= vfdsu_ex3_double;
     vfdsu_ex4_single          <= vfdsu_ex3_single;
+    vfdsu_ex4_half            <= vfdsu_ex3_half;
+    vfdsu_ex4_bfloat          <= vfdsu_ex3_bfloat;
   end
   else
   begin
@@ -1009,6 +1124,8 @@ begin
     vfdsu_ex4_potnt_norm[1:0] <= vfdsu_ex4_potnt_norm[1:0];
     vfdsu_ex4_double          <= vfdsu_ex4_double;
     vfdsu_ex4_single          <= vfdsu_ex4_single;
+    vfdsu_ex4_half            <= vfdsu_ex4_half;
+    vfdsu_ex4_bfloat          <= vfdsu_ex4_bfloat;
   end  
 end    
 
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v
index c7a679c1..4d91a2cc 100644
--- a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v
@@ -30,6 +30,8 @@ module ct_vfdsu_scalar_dp(
   ex1_double,
   ex1_pipedown,
   ex1_scalar,
+  ex1_half,
+  ex1_bfloat,
   ex1_single,
   ex1_sqrt,
   ex1_src0,
@@ -50,7 +52,9 @@ module ct_vfdsu_scalar_dp(
   pipex_dp_vfdsu_freg_data,
   pipex_dp_vfdsu_vreg,
   vfdsu_ex2_double,
-  vfdsu_ex2_single
+  vfdsu_ex2_single,
+  vfdsu_ex2_half,
+  vfdsu_ex2_bfloat
 );
 
 // &Ports; @24
@@ -79,6 +83,8 @@ output          ex1_div;
 output          ex1_double;                   
 output          ex1_scalar;                   
 output          ex1_single;                   
+output          ex1_half;
+output          ex1_bfloat;
 output          ex1_sqrt;                     
 output  [63:0]  ex1_src0;                     
 output  [63:0]  ex1_src1;                     
@@ -89,11 +95,15 @@ output  [63:0]  pipex_dp_vfdsu_freg_data;
 output  [6 :0]  pipex_dp_vfdsu_vreg;          
 output          vfdsu_ex2_double;             
 output          vfdsu_ex2_single;             
+output          vfdsu_ex2_half;
+output          vfdsu_ex2_bfloat;
 
 // &Regs; @25
 reg             ex1_div;                      
 reg             ex1_double;                   
 reg             ex1_single;                   
+reg             ex1_half;
+reg             ex1_bfloat;
 reg             ex1_sqrt;                     
 reg             vfdsu_ex2_div;                
 reg             vfdsu_ex2_double;             
@@ -101,6 +111,8 @@ reg     [4 :0]  vfdsu_ex2_dst_ereg;
 reg     [6 :0]  vfdsu_ex2_dst_vreg;           
 reg     [6 :0]  vfdsu_ex2_iid;                
 reg             vfdsu_ex2_single;             
+reg             vfdsu_ex2_half;
+reg             vfdsu_ex2_bfloat;
 reg             vfdsu_ex2_sqrt;               
 reg     [4 :0]  vfdsu_ex3_dst_ereg;           
 reg     [6 :0]  vfdsu_ex3_dst_vreg;           
@@ -175,6 +187,8 @@ begin
     ex1_sqrt           <= 1'b0;
     ex1_double         <= 1'b0;
     ex1_single         <= 1'b0;
+    ex1_half           <= 1'b0;
+    ex1_bfloat         <= 1'b0;
   end
   else if(idu_vfpu_rf_pipex_gateclk_sel)
   begin
@@ -182,6 +196,8 @@ begin
     ex1_sqrt           <= idu_vfpu_rf_pipex_func[1];
     ex1_double         <= idu_vfpu_rf_pipex_func[16];
     ex1_single         <= idu_vfpu_rf_pipex_func[15];
+    ex1_half           <= idu_vfpu_rf_pipex_func[14];
+    ex1_bfloat         <= idu_vfpu_rf_pipex_func[13];
   end
 end
 assign ex1_scalar         = 1'b1;
@@ -204,6 +220,8 @@ begin
     vfdsu_ex2_iid[6:0]      <= 7'b0;
     vfdsu_ex2_double        <= 1'b0;
     vfdsu_ex2_single        <= 1'b0;
+    vfdsu_ex2_half          <= 1'b0;
+    vfdsu_ex2_bfloat        <= 1'b0;
     vfdsu_ex2_div           <=  1'b0;
     vfdsu_ex2_sqrt          <=  1'b0;
   end
@@ -214,6 +232,8 @@ begin
     vfdsu_ex2_iid[6:0]      <= dp_vfdsu_ex1_pipex_iid[6:0];
     vfdsu_ex2_double        <= ex1_double;
     vfdsu_ex2_single        <= ex1_single;
+    vfdsu_ex2_half          <= ex1_half;
+    vfdsu_ex2_bfloat        <= ex1_bfloat;
     vfdsu_ex2_div           <= ex1_div;
     vfdsu_ex2_sqrt          <= ex1_sqrt;
   end
@@ -224,6 +244,8 @@ begin
     vfdsu_ex2_iid[6:0]      <= vfdsu_ex2_iid[6:0];
     vfdsu_ex2_double        <= vfdsu_ex2_double;
     vfdsu_ex2_single        <= vfdsu_ex2_single;
+    vfdsu_ex2_half          <= vfdsu_ex2_half;   // hold value, like every other ex2 register in this branch
+    vfdsu_ex2_bfloat        <= vfdsu_ex2_bfloat; // hold value, like every other ex2 register in this branch
     vfdsu_ex2_div           <= vfdsu_ex2_div;
     vfdsu_ex2_sqrt          <= vfdsu_ex2_sqrt;
   end
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v
index cdeb3a30..4e2c68b0 100644
--- a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v
@@ -49,12 +49,15 @@ module ct_vfdsu_srt(
   vfdsu_ex2_result_zero,
   vfdsu_ex2_rm,
   vfdsu_ex2_single,
+  vfdsu_ex2_half,
+  vfdsu_ex2_bfloat,
   vfdsu_ex2_sqrt,
   vfdsu_ex2_srt_skip,
   vfdsu_ex3_doub_expnt_rst,
   vfdsu_ex3_double,
   vfdsu_ex3_dz,
   vfdsu_ex3_half_expnt_rst,
+  vfdsu_ex3_bfloat_expnt_rst,
   vfdsu_ex3_id_srt_skip,
   vfdsu_ex3_nv,
   vfdsu_ex3_of,
@@ -74,6 +77,8 @@ module ct_vfdsu_srt(
   vfdsu_ex3_rslt_denorm,
   vfdsu_ex3_sing_expnt_rst,
   vfdsu_ex3_single,
+  vfdsu_ex3_half,
+  vfdsu_ex3_bfloat,
   vfdsu_ex3_uf
 );
 
@@ -109,6 +114,8 @@ input           vfdsu_ex2_result_sign;
 input           vfdsu_ex2_result_zero;                 
 input   [2 :0]  vfdsu_ex2_rm;                          
 input           vfdsu_ex2_single;                      
+input           vfdsu_ex2_half;
+input           vfdsu_ex2_bfloat;
 input           vfdsu_ex2_sqrt;                        
 input           vfdsu_ex2_srt_skip;                    
 output          srt_ctrl_rem_zero;                     
@@ -118,6 +125,7 @@ output  [12:0]  vfdsu_ex3_doub_expnt_rst;
 output          vfdsu_ex3_double;                      
 output          vfdsu_ex3_dz;                          
 output  [12:0]  vfdsu_ex3_half_expnt_rst;              
+output  [12:0]  vfdsu_ex3_bfloat_expnt_rst;
 output          vfdsu_ex3_id_srt_skip;                 
 output          vfdsu_ex3_nv;                          
 output          vfdsu_ex3_of;                          
@@ -137,16 +145,20 @@ output  [2 :0]  vfdsu_ex3_rm;
 output          vfdsu_ex3_rslt_denorm;                 
 output  [8 :0]  vfdsu_ex3_sing_expnt_rst;              
 output          vfdsu_ex3_single;                      
+output          vfdsu_ex3_half;
+output          vfdsu_ex3_bfloat;
 output          vfdsu_ex3_uf;                          
 
 // &Regs; @24
 reg     [52:0]  ex2_result_double_denorm_round_add_num; 
 reg     [52:0]  ex2_result_half_denorm_round_add_num;  
 reg     [52:0]  ex2_result_single_denorm_round_add_num; 
+reg     [52:0]  ex2_result_bfloat_denorm_round_add_num;
 reg     [12:0]  vfdsu_ex3_doub_expnt_rst;              
 reg             vfdsu_ex3_double;                      
 reg             vfdsu_ex3_dz;                          
 reg     [12:0]  vfdsu_ex3_half_expnt_rst;              
+reg     [12:0]  vfdsu_ex3_bfloat_expnt_rst;
 reg             vfdsu_ex3_id_srt_skip;                 
 reg             vfdsu_ex3_nv;                          
 reg             vfdsu_ex3_of;                          
@@ -165,6 +177,8 @@ reg     [2 :0]  vfdsu_ex3_rm;
 reg             vfdsu_ex3_rslt_denorm;                 
 reg     [8 :0]  vfdsu_ex3_sing_expnt_rst;              
 reg             vfdsu_ex3_single;                      
+reg             vfdsu_ex3_half;
+reg             vfdsu_ex3_bfloat;
 reg             vfdsu_ex3_uf;                          
 
 // &Wires; @25
@@ -191,6 +205,11 @@ wire            ex2_half_expnt_uf;
 wire            ex2_half_id_nor_srt_skip;              
 wire            ex2_half_potnt_of;                     
 wire            ex2_half_potnt_uf;                     
+wire            ex2_bfloat_expnt_of;
+wire            ex2_bfloat_expnt_uf;
+wire            ex2_bfloat_id_nor_srt_skip;
+wire            ex2_bfloat_potnt_of;
+wire            ex2_bfloat_potnt_uf;
 wire            ex2_id_nor_srt_skip;                   
 wire            ex2_of;                                
 wire            ex2_of_plus;                           
@@ -253,6 +272,8 @@ wire            vfdsu_ex2_result_sign;
 wire            vfdsu_ex2_result_zero;                 
 wire    [2 :0]  vfdsu_ex2_rm;                          
 wire            vfdsu_ex2_single;                      
+wire            vfdsu_ex2_half;
+wire            vfdsu_ex2_bfloat;
 wire            vfdsu_ex2_sqrt;                        
 wire            vfdsu_ex2_srt_skip;                    
 wire            vfdsu_ex3_rem_zero;                    
@@ -281,25 +302,33 @@ assign ex2_sing_expnt_of = ~vfdsu_ex2_expnt_rst[9] && (vfdsu_ex2_expnt_rst[8]
 assign ex2_half_expnt_of = ~vfdsu_ex2_expnt_rst[6] && (vfdsu_ex2_expnt_rst[5] 
                                                       || (vfdsu_ex2_expnt_rst[4]  &&
                                                           |vfdsu_ex2_expnt_rst[3:0]));
+assign ex2_bfloat_expnt_of = ~vfdsu_ex2_expnt_rst[9] && (vfdsu_ex2_expnt_rst[8]
+                                                      || (vfdsu_ex2_expnt_rst[7]  &&
+                                                          |vfdsu_ex2_expnt_rst[6:0]));
 assign ex2_expnt_of      = vfdsu_ex2_double ? ex2_doub_expnt_of :
-                                              vfdsu_ex2_single  ? ex2_sing_expnt_of
-                                                                : ex2_half_expnt_of;
+                                              vfdsu_ex2_single  ? ex2_sing_expnt_of :
+                                              vfdsu_ex2_half    ? ex2_half_expnt_of : ex2_bfloat_expnt_of;
 assign ex2_potnt_of_pre  = vfdsu_ex2_double ? ex2_doub_potnt_of :
-                           vfdsu_ex2_single ? ex2_sing_potnt_of : ex2_half_potnt_of;   
-assign ex2_potnt_uf_pre  = vfdsu_ex2_double ? ex2_doub_potnt_uf : 
-                           vfdsu_ex2_single ? ex2_sing_potnt_uf : ex2_half_potnt_uf;
+                           vfdsu_ex2_single ? ex2_sing_potnt_of :
+                           vfdsu_ex2_half   ? ex2_half_potnt_of : ex2_bfloat_potnt_of;
+assign ex2_potnt_uf_pre  = vfdsu_ex2_double ? ex2_doub_potnt_uf :
+                           vfdsu_ex2_single ? ex2_sing_potnt_uf :
+                           vfdsu_ex2_half   ? ex2_half_potnt_uf : ex2_bfloat_potnt_uf;
 assign ex2_expnt_uf      = vfdsu_ex2_double ? ex2_doub_expnt_uf :
-                           vfdsu_ex2_single ? ex2_sing_expnt_uf : ex2_half_expnt_uf;
+                           vfdsu_ex2_single ? ex2_sing_expnt_uf :
+                           vfdsu_ex2_half   ? ex2_half_expnt_uf : ex2_bfloat_expnt_uf;
 assign ex2_id_nor_srt_skip   = vfdsu_ex2_double ? ex2_double_id_nor_srt_skip :
-                               vfdsu_ex2_single ? ex2_single_id_nor_srt_skip
-                                                : ex2_half_id_nor_srt_skip; 
+                               vfdsu_ex2_single ? ex2_single_id_nor_srt_skip :
+                               vfdsu_ex2_half   ? ex2_half_id_nor_srt_skip   : ex2_bfloat_id_nor_srt_skip;
 assign ex2_result_denorm_round_add_num[52:0] = vfdsu_ex2_double ? 
                                                ex2_result_double_denorm_round_add_num[52:0] :
                                                vfdsu_ex2_single ? 
                                                ex2_result_single_denorm_round_add_num[52:0] :
-                                               ex2_result_half_denorm_round_add_num[52:0];
-                                             
-                                                      
+                                               vfdsu_ex2_half   ?
+                                               ex2_result_half_denorm_round_add_num[52:0] :
+                                               ex2_result_bfloat_denorm_round_add_num[52:0];
+
+
 //potential overflow when E1-E2 = 128/1024
 assign ex2_doub_potnt_of = ~vfdsu_ex2_expnt_rst[12] && 
                            ~vfdsu_ex2_expnt_rst[11] &&
@@ -313,6 +342,10 @@ assign ex2_half_potnt_of = ~vfdsu_ex2_expnt_rst[6]  &&
                            ~vfdsu_ex2_expnt_rst[5]  &&
                             vfdsu_ex2_expnt_rst[4]  &&
                           ~|vfdsu_ex2_expnt_rst[3:0];  
+assign ex2_bfloat_potnt_of = ~vfdsu_ex2_expnt_rst[9]  &&
+                           ~vfdsu_ex2_expnt_rst[8]  &&
+                            vfdsu_ex2_expnt_rst[7]  &&
+                          ~|vfdsu_ex2_expnt_rst[6:0];
 assign ex2_potnt_of      = ex2_potnt_of_pre && 
                            vfdsu_ex2_op0_norm && 
                            vfdsu_ex2_op1_norm && 
@@ -321,6 +354,7 @@ assign ex2_potnt_of      = ex2_potnt_of_pre &&
 //When input is normal, underflow when E1-E2 <= -127/-1023/-15
 assign ex2_doub_expnt_uf = vfdsu_ex2_expnt_rst[12] && (vfdsu_ex2_expnt_rst[11:0] <= 12'hc01);
 assign ex2_sing_expnt_uf = vfdsu_ex2_expnt_rst[12] && (vfdsu_ex2_expnt_rst[11:0] <= 12'hf81);
+assign ex2_bfloat_expnt_uf = vfdsu_ex2_expnt_rst[12] && (vfdsu_ex2_expnt_rst[11:0] <= 12'hf81);
 assign ex2_half_expnt_uf = vfdsu_ex2_expnt_rst[12] && (vfdsu_ex2_expnt_rst[11:0] <= 12'hff1);
 assign ex2_half_potnt_uf = &vfdsu_ex2_expnt_rst[6:4]   &&
                           ~|vfdsu_ex2_expnt_rst[3:2]   &&
@@ -337,6 +371,10 @@ assign ex2_sing_potnt_uf = &vfdsu_ex2_expnt_rst[9:7]   &&
                           ~|vfdsu_ex2_expnt_rst[6:2]   &&
                             vfdsu_ex2_expnt_rst[1]     &&
                            !vfdsu_ex2_expnt_rst[0];
+assign ex2_bfloat_potnt_uf = &vfdsu_ex2_expnt_rst[9:7]   &&
+                          ~|vfdsu_ex2_expnt_rst[6:2]   &&
+                            vfdsu_ex2_expnt_rst[1]     &&
+                           !vfdsu_ex2_expnt_rst[0];
 
 assign ex2_potnt_uf      = (ex2_potnt_uf_pre && 
                             vfdsu_ex2_op0_norm && 
@@ -371,6 +409,8 @@ assign ex2_single_id_nor_srt_skip =  vfdsu_ex2_expnt_rst[12]
                                      && (vfdsu_ex2_expnt_rst[11:0]<12'hf6a);
 assign ex2_half_id_nor_srt_skip   =  vfdsu_ex2_expnt_rst[12] 
                                      && (vfdsu_ex2_expnt_rst[11:0]<12'hfe7);
+assign ex2_bfloat_id_nor_srt_skip =  vfdsu_ex2_expnt_rst[12]
+                                     && (vfdsu_ex2_expnt_rst[11:0]<12'hf6a);
 assign ex2_rslt_denorm            = ex2_uf;
 
 //=======================EX2 skip srt iteration======================
@@ -490,6 +530,21 @@ endcase
 // &CombEnd; @248
 end
 
+always @( vfdsu_ex2_expnt_rst[12:0]) // combinational table: picks the one-hot ex2_result_bfloat_denorm_round_add_num from the 13-bit result exponent
+begin
+case(vfdsu_ex2_expnt_rst[12:0])
+  13'h1f82:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h200000000000; // expnt_rst = -126 (two's complement): bit 45 set
+  13'h1f81:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h400000000000; // expnt_rst = -127: bit 46 set
+  13'h1f80:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h800000000000; // expnt_rst = -128: bit 47 set
+  13'h1f7f:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h1000000000000; // expnt_rst = -129: bit 48 set
+  13'h1f7e:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h2000000000000; // expnt_rst = -130: bit 49 set
+  13'h1f7d:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h4000000000000; // expnt_rst = -131: bit 50 set
+  13'h1f7c:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h8000000000000; // expnt_rst = -132: bit 51 set
+  13'h1f7b:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h10000000000000; // expnt_rst = -133: bit 52 set
+  default: ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h0;  // any other exponent value: all-zero add number
+endcase
+end
+
 //===================special result========================
 assign ex2_result_zero = vfdsu_ex2_result_zero;
 assign ex2_result_qnan = vfdsu_ex2_result_qnan;
@@ -541,6 +596,7 @@ begin
     vfdsu_ex3_doub_expnt_rst[12:0] <= 13'b0;
     vfdsu_ex3_sing_expnt_rst[8:0] <= 9'b0;
     vfdsu_ex3_half_expnt_rst[12:0] <= 13'b0;
+    vfdsu_ex3_bfloat_expnt_rst[12:0] <= 13'b0;
     vfdsu_ex3_result_sign     <= 1'b0;
     vfdsu_ex3_qnan_sign       <= 1'b0;    
     vfdsu_ex3_qnan_f[51:0]    <= 52'b0;
@@ -551,6 +607,8 @@ begin
     vfdsu_ex3_id_srt_skip     <= 1'b0;
     vfdsu_ex3_double          <=  1'b0;
     vfdsu_ex3_single          <=  1'b0;
+    vfdsu_ex3_half            <=  1'b0;
+    vfdsu_ex3_bfloat          <=  1'b0;
   end
   else if(ex2_pipedown)
   begin
@@ -569,6 +627,7 @@ begin
     vfdsu_ex3_doub_expnt_rst[12:0] <= vfdsu_ex2_expnt_rst[12:0];
     vfdsu_ex3_sing_expnt_rst[8:0] <= vfdsu_ex2_expnt_rst[8:0];
     vfdsu_ex3_half_expnt_rst[12:0] <= vfdsu_ex2_expnt_rst[12:0];
+    vfdsu_ex3_bfloat_expnt_rst[12:0] <= vfdsu_ex2_expnt_rst[12:0];
     vfdsu_ex3_result_sign     <= vfdsu_ex2_result_sign;
     vfdsu_ex3_qnan_sign       <= vfdsu_ex2_qnan_sign;    
     vfdsu_ex3_qnan_f[51:0]    <= vfdsu_ex2_qnan_f[51:0];
@@ -579,6 +638,8 @@ begin
     vfdsu_ex3_id_srt_skip     <= ex2_id_nor_srt_skip;
     vfdsu_ex3_double          <= vfdsu_ex2_double;
     vfdsu_ex3_single          <= vfdsu_ex2_single;
+    vfdsu_ex3_half            <= vfdsu_ex2_half;
+    vfdsu_ex3_bfloat          <= vfdsu_ex2_bfloat;
   end
   else
   begin
@@ -597,6 +658,7 @@ begin
     vfdsu_ex3_doub_expnt_rst[12:0] <= vfdsu_ex3_doub_expnt_rst[12:0];
     vfdsu_ex3_sing_expnt_rst[8:0] <= vfdsu_ex3_sing_expnt_rst[8:0];
     vfdsu_ex3_half_expnt_rst[12:0] <= vfdsu_ex3_half_expnt_rst[12:0];
+    vfdsu_ex3_bfloat_expnt_rst[12:0] <= vfdsu_ex3_bfloat_expnt_rst[12:0];
     vfdsu_ex3_result_sign     <= vfdsu_ex3_result_sign;
     vfdsu_ex3_qnan_sign       <= vfdsu_ex3_qnan_sign;     
     vfdsu_ex3_qnan_f[51:0]    <= vfdsu_ex3_qnan_f[51:0];
@@ -607,6 +669,8 @@ begin
     vfdsu_ex3_id_srt_skip    <=  vfdsu_ex3_id_srt_skip;
     vfdsu_ex3_double          <= vfdsu_ex3_double;
     vfdsu_ex3_single          <= vfdsu_ex3_single;
+    vfdsu_ex3_half            <= vfdsu_ex3_half;
+    vfdsu_ex3_bfloat          <= vfdsu_ex3_bfloat;
   end
 end
 assign vfdsu_ex3_rem_zero       =  ~|srt_remainder[60:0];
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v
index f8846255..28ca2595 100644
--- a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v
@@ -99,6 +99,8 @@ wire            ex1_double;
 wire            ex1_pipedown;                 
 wire            ex1_scalar;                   
 wire            ex1_single;                   
+wire            ex1_half;
+wire            ex1_bfloat;
 wire            ex1_sqrt;                     
 wire    [63:0]  ex1_src0;                     
 wire    [63:0]  ex1_src1;                     
@@ -128,6 +130,8 @@ wire            vfdsu_dp_fdiv_busy;
 wire            vfdsu_dp_inst_wb_req;         
 wire            vfdsu_ex2_double;             
 wire            vfdsu_ex2_single;             
+wire            vfdsu_ex2_half;
+wire            vfdsu_ex2_bfloat;
 wire            vfdsu_ifu_debug_ex2_wait;     
 wire            vfdsu_ifu_debug_idle;         
 wire            vfdsu_ifu_debug_pipe_busy;    
@@ -234,6 +238,8 @@ ct_vfdsu_ctrl  x_ct_vfdsu_ctrl (
   .ex1_double                  (ex1_double                 ),
   .ex1_pipedown                (ex1_pipedown               ),
   .ex1_single                  (ex1_single                 ),
+  .ex1_half                    (ex1_half                   ),
+  .ex1_bfloat                  (ex1_bfloat                 ),
   .ex2_data_clk                (ex2_data_clk               ),
   .ex2_pipedown                (ex2_pipedown               ),
   .ex2_srt_first_round         (ex2_srt_first_round        ),
@@ -251,6 +257,8 @@ ct_vfdsu_ctrl  x_ct_vfdsu_ctrl (
   .vfdsu_dp_inst_wb_req        (vfdsu_dp_inst_wb_req       ),
   .vfdsu_ex2_double            (vfdsu_ex2_double           ),
   .vfdsu_ex2_single            (vfdsu_ex2_single           ),
+  .vfdsu_ex2_half              (vfdsu_ex2_half             ),
+  .vfdsu_ex2_bfloat            (vfdsu_ex2_bfloat           ),
   .vfdsu_ifu_debug_ex2_wait    (vfdsu_ifu_debug_ex2_wait   ),
   .vfdsu_ifu_debug_idle        (vfdsu_ifu_debug_idle       ),
   .vfdsu_ifu_debug_pipe_busy   (vfdsu_ifu_debug_pipe_busy  )
@@ -266,6 +274,8 @@ ct_vfdsu_double  x_ct_vfdsu_double (
   .ex1_pipedown        (ex1_pipedown       ),
   .ex1_scalar          (ex1_scalar         ),
   .ex1_single          (ex1_single         ),
+  .ex1_half            (ex1_half           ),
+  .ex1_bfloat          (ex1_bfloat         ),
   .ex1_sqrt            (ex1_sqrt           ),
   .ex1_src0            (ex1_src0           ),
   .ex1_src1            (ex1_src1           ),
@@ -302,6 +312,8 @@ ct_vfdsu_scalar_dp  x_ct_vfdsu_scalar_dp (
   .ex1_pipedown                  (ex1_pipedown                 ),
   .ex1_scalar                    (ex1_scalar                   ),
   .ex1_single                    (ex1_single                   ),
+  .ex1_half                      (ex1_half                     ),
+  .ex1_bfloat                    (ex1_bfloat                   ),
   .ex1_sqrt                      (ex1_sqrt                     ),
   .ex1_src0                      (ex1_src0                     ),
   .ex1_src1                      (ex1_src1                     ),
@@ -321,7 +333,9 @@ ct_vfdsu_scalar_dp  x_ct_vfdsu_scalar_dp (
   .pipex_dp_vfdsu_freg_data      (pipex_dp_vfdsu_freg_data     ),
   .pipex_dp_vfdsu_vreg           (pipex_dp_vfdsu_vreg          ),
   .vfdsu_ex2_double              (vfdsu_ex2_double             ),
-  .vfdsu_ex2_single              (vfdsu_ex2_single             )
+  .vfdsu_ex2_single              (vfdsu_ex2_single             ),
+  .vfdsu_ex2_half                (vfdsu_ex2_half               ),
+  .vfdsu_ex2_bfloat              (vfdsu_ex2_bfloat             )
 );
 
 
diff --git a/vendor/patches/openc910/0001-Add-FP16ALT-support-to-THMULTI-DivSqrt-unit.patch b/vendor/patches/openc910/0001-Add-FP16ALT-support-to-THMULTI-DivSqrt-unit.patch
new file mode 100644
index 00000000..7d1ce903
--- /dev/null
+++ b/vendor/patches/openc910/0001-Add-FP16ALT-support-to-THMULTI-DivSqrt-unit.patch
@@ -0,0 +1,1359 @@
+From 032de47f043e3fe1dcb34c52363f7cb837681b33 Mon Sep 17 00:00:00 2001
+From: Luca Bertaccini <lbertaccini@iis.ee.ethz.ch>
+Date: Mon, 24 Jun 2024 17:30:43 +0200
+Subject: [PATCH] Add FP16ALT support to THMULTI DivSqrt unit
+
+---
+ .../gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v              |  21 ++-
+ .../gen_rtl/vfdsu/rtl/ct_vfdsu_double.v            |  29 ++++
+ .../gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v              |  65 +++++++--
+ .../gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v           |  96 +++++++++----
+ .../gen_rtl/vfdsu/rtl/ct_vfdsu_round.v             | 152 ++++++++++++++++++---
+ .../gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v         |  24 +++-
+ .../gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v               |  88 ++++++++++--
+ .../gen_rtl/vfdsu/rtl/ct_vfdsu_top.v               |  16 ++-
+ 8 files changed, 423 insertions(+), 68 deletions(-)
+
+diff --git a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v
+index f7f541f..0aba4f1 100644
+--- a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v
++++ b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_ctrl.v
+@@ -26,6 +26,8 @@ module ct_vfdsu_ctrl(
+   ex1_double,
+   ex1_pipedown,
+   ex1_single,
++  ex1_half,
++  ex1_bfloat,
+   ex2_data_clk,
+   ex2_pipedown,
+   ex2_srt_first_round,
+@@ -43,6 +45,8 @@ module ct_vfdsu_ctrl(
+   vfdsu_dp_inst_wb_req,
+   vfdsu_ex2_double,
+   vfdsu_ex2_single,
++  vfdsu_ex2_half,
++  vfdsu_ex2_bfloat,
+   vfdsu_ifu_debug_ex2_wait,
+   vfdsu_ifu_debug_idle,
+   vfdsu_ifu_debug_pipe_busy
+@@ -57,6 +61,8 @@ input          dp_vfdsu_fdiv_gateclk_issue;
+ input          dp_vfdsu_idu_fdiv_issue;    
+ input          ex1_double;                 
+ input          ex1_single;                 
++input          ex1_half;
++input          ex1_bfloat;
+ input          forever_cpuclk;             
+ input          pad_yy_icg_scan_en;         
+ input          rtu_yy_xx_flush;            
+@@ -64,6 +70,8 @@ input          srt_ctrl_rem_zero;
+ input          srt_ctrl_skip_srt;          
+ input          vfdsu_ex2_double;           
+ input          vfdsu_ex2_single;           
++input          vfdsu_ex2_half;
++input          vfdsu_ex2_bfloat;
+ output         ex1_data_clk;               
+ output         ex1_pipedown;               
+ output         ex2_data_clk;               
+@@ -106,6 +114,8 @@ wire           ex1_data_clk_en;
+ wire           ex1_double;                 
+ wire           ex1_pipedown;               
+ wire           ex1_single;                 
++wire           ex1_half;
++wire           ex1_bfloat;
+ wire           ex2_data_clk;               
+ wire           ex2_data_clk_en;            
+ wire           ex2_pipe_clk;               
+@@ -137,6 +147,8 @@ wire           vfdsu_dp_fdiv_busy;
+ wire           vfdsu_dp_inst_wb_req;       
+ wire           vfdsu_ex2_double;           
+ wire           vfdsu_ex2_single;           
++wire           vfdsu_ex2_half;
++wire           vfdsu_ex2_bfloat;
+ wire           vfdsu_ex2_vld;              
+ wire           vfdsu_ifu_debug_ex2_wait;   
+ wire           vfdsu_ifu_debug_idle;       
+@@ -244,8 +256,9 @@ end
+ //For Double, initial is 5'b11100('d28), calculate 29 round
+ //For Single, initial is 5'b01110('d14), calculate 15 round
+ assign srt_cnt_ini[4:0] = (ex1_double) ? 5'b01101 :
+-                           ex1_single  ? 5'b00110
+-                                       : 5'b00011;
++                          (ex1_single) ? 5'b00110 :
++                          (ex1_half)   ? 5'b00011
++                                       : 5'b00010;
+ 
+ //vfdsu ex2 pipedown signal
+ assign ex2_pipedown = srt_last_round && div_st_ex2;
+@@ -277,7 +290,9 @@ assign srt_secd_round  = ex2_srt_secd_round;
+ 
+ assign ex2_srt_secd_round_pre  = srt_sm_on && srt_secd_round_pre;
+ assign srt_secd_round_pre      = vfdsu_ex2_double ? srt_cnt[4:0]==5'b01101 : 
+-                                 vfdsu_ex2_single ? srt_cnt[4:0]==5'b00110 : srt_cnt[4:0] == 5'b00011;
++                                 vfdsu_ex2_single ? srt_cnt[4:0]==5'b00110 :
++                                 vfdsu_ex2_half   ? srt_cnt[4:0]==5'b00011
++                                                  : srt_cnt[4:0]==5'b00010;
+ 
+ //==========================================================
+ //              EX3 Stage Control Signal
+diff --git a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_double.v b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_double.v
+index b57e289..ccd34f9 100644
+--- a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_double.v
++++ b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_double.v
+@@ -24,6 +24,8 @@ module ct_vfdsu_double(
+   ex1_pipedown,
+   ex1_scalar,
+   ex1_single,
++  ex1_half,
++  ex1_bfloat,
+   ex1_sqrt,
+   ex1_src0,
+   ex1_src1,
+@@ -52,6 +54,8 @@ input           ex1_double;
+ input           ex1_pipedown;                         
+ input           ex1_scalar;                           
+ input           ex1_single;                           
++input           ex1_half;
++input           ex1_bfloat;
+ input           ex1_sqrt;                             
+ input   [63:0]  ex1_src0;                             
+ input   [63:0]  ex1_src1;                             
+@@ -83,6 +87,8 @@ wire            ex1_pipedown;
+ wire    [59:0]  ex1_remainder;                        
+ wire            ex1_scalar;                           
+ wire            ex1_single;                           
++wire            ex1_half;
++wire            ex1_bfloat;
+ wire            ex1_sqrt;                             
+ wire    [63:0]  ex1_src0;                             
+ wire    [63:0]  ex1_src1;                             
+@@ -116,12 +122,15 @@ wire            vfdsu_ex2_result_sign;
+ wire            vfdsu_ex2_result_zero;                
+ wire    [2 :0]  vfdsu_ex2_rm;                         
+ wire            vfdsu_ex2_single;                     
++wire            vfdsu_ex2_half;
++wire            vfdsu_ex2_bfloat;
+ wire            vfdsu_ex2_sqrt;                       
+ wire            vfdsu_ex2_srt_skip;                   
+ wire    [12:0]  vfdsu_ex3_doub_expnt_rst;             
+ wire            vfdsu_ex3_double;                     
+ wire            vfdsu_ex3_dz;                         
+ wire    [12:0]  vfdsu_ex3_half_expnt_rst;             
++wire    [12:0]  vfdsu_ex3_bfloat_expnt_rst;
+ wire            vfdsu_ex3_id_srt_skip;                
+ wire            vfdsu_ex3_nv;                         
+ wire            vfdsu_ex3_of;                         
+@@ -141,6 +150,8 @@ wire    [2 :0]  vfdsu_ex3_rm;
+ wire            vfdsu_ex3_rslt_denorm;                
+ wire    [8 :0]  vfdsu_ex3_sing_expnt_rst;             
+ wire            vfdsu_ex3_single;                     
++wire            vfdsu_ex3_half;
++wire            vfdsu_ex3_bfloat;
+ wire            vfdsu_ex3_uf;                         
+ wire            vfdsu_ex4_denorm_to_tiny_frac;        
+ wire            vfdsu_ex4_double;                     
+@@ -164,6 +175,8 @@ wire            vfdsu_ex4_result_sign;
+ wire            vfdsu_ex4_result_zero;                
+ wire            vfdsu_ex4_rslt_denorm;                
+ wire            vfdsu_ex4_single;                     
++wire            vfdsu_ex4_half;
++wire            vfdsu_ex4_bfloat;
+ wire            vfdsu_ex4_uf;                         
+ wire            vfpu_yy_xx_dqnan;                     
+ wire    [2 :0]  vfpu_yy_xx_rm;                        
+@@ -181,6 +194,8 @@ ct_vfdsu_prepare  x_ct_vfdsu_prepare (
+   .ex1_remainder         (ex1_remainder        ),
+   .ex1_scalar            (ex1_scalar           ),
+   .ex1_single            (ex1_single           ),
++  .ex1_half              (ex1_half             ),
++  .ex1_bfloat            (ex1_bfloat           ),
+   .ex1_sqrt              (ex1_sqrt             ),
+   .ex1_src0              (ex1_src0             ),
+   .ex1_src1              (ex1_src1             ),
+@@ -204,6 +219,8 @@ ct_vfdsu_prepare  x_ct_vfdsu_prepare (
+   .vfdsu_ex2_result_zero (vfdsu_ex2_result_zero),
+   .vfdsu_ex2_rm          (vfdsu_ex2_rm         ),
+   .vfdsu_ex2_single      (vfdsu_ex2_single     ),
++  .vfdsu_ex2_half        (vfdsu_ex2_half       ),
++  .vfdsu_ex2_bfloat      (vfdsu_ex2_bfloat     ),
+   .vfdsu_ex2_sqrt        (vfdsu_ex2_sqrt       ),
+   .vfdsu_ex2_srt_skip    (vfdsu_ex2_srt_skip   ),
+   .vfpu_yy_xx_dqnan      (vfpu_yy_xx_dqnan     ),
+@@ -246,12 +263,15 @@ ct_vfdsu_srt  x_ct_vfdsu_srt (
+   .vfdsu_ex2_result_zero                 (vfdsu_ex2_result_zero                ),
+   .vfdsu_ex2_rm                          (vfdsu_ex2_rm                         ),
+   .vfdsu_ex2_single                      (vfdsu_ex2_single                     ),
++  .vfdsu_ex2_half                        (vfdsu_ex2_half                       ),
++  .vfdsu_ex2_bfloat                      (vfdsu_ex2_bfloat                     ),
+   .vfdsu_ex2_sqrt                        (vfdsu_ex2_sqrt                       ),
+   .vfdsu_ex2_srt_skip                    (vfdsu_ex2_srt_skip                   ),
+   .vfdsu_ex3_doub_expnt_rst              (vfdsu_ex3_doub_expnt_rst             ),
+   .vfdsu_ex3_double                      (vfdsu_ex3_double                     ),
+   .vfdsu_ex3_dz                          (vfdsu_ex3_dz                         ),
+   .vfdsu_ex3_half_expnt_rst              (vfdsu_ex3_half_expnt_rst             ),
++  .vfdsu_ex3_bfloat_expnt_rst            (vfdsu_ex3_bfloat_expnt_rst           ),
+   .vfdsu_ex3_id_srt_skip                 (vfdsu_ex3_id_srt_skip                ),
+   .vfdsu_ex3_nv                          (vfdsu_ex3_nv                         ),
+   .vfdsu_ex3_of                          (vfdsu_ex3_of                         ),
+@@ -271,6 +291,8 @@ ct_vfdsu_srt  x_ct_vfdsu_srt (
+   .vfdsu_ex3_rslt_denorm                 (vfdsu_ex3_rslt_denorm                ),
+   .vfdsu_ex3_sing_expnt_rst              (vfdsu_ex3_sing_expnt_rst             ),
+   .vfdsu_ex3_single                      (vfdsu_ex3_single                     ),
++  .vfdsu_ex3_half                        (vfdsu_ex3_half                       ),
++  .vfdsu_ex3_bfloat                      (vfdsu_ex3_bfloat                     ),
+   .vfdsu_ex3_uf                          (vfdsu_ex3_uf                         )
+ );
+ 
+@@ -288,6 +310,7 @@ ct_vfdsu_round  x_ct_vfdsu_round (
+   .vfdsu_ex3_double                      (vfdsu_ex3_double                     ),
+   .vfdsu_ex3_dz                          (vfdsu_ex3_dz                         ),
+   .vfdsu_ex3_half_expnt_rst              (vfdsu_ex3_half_expnt_rst             ),
++  .vfdsu_ex3_bfloat_expnt_rst            (vfdsu_ex3_bfloat_expnt_rst           ),
+   .vfdsu_ex3_id_srt_skip                 (vfdsu_ex3_id_srt_skip                ),
+   .vfdsu_ex3_nv                          (vfdsu_ex3_nv                         ),
+   .vfdsu_ex3_of                          (vfdsu_ex3_of                         ),
+@@ -307,6 +330,8 @@ ct_vfdsu_round  x_ct_vfdsu_round (
+   .vfdsu_ex3_rslt_denorm                 (vfdsu_ex3_rslt_denorm                ),
+   .vfdsu_ex3_sing_expnt_rst              (vfdsu_ex3_sing_expnt_rst             ),
+   .vfdsu_ex3_single                      (vfdsu_ex3_single                     ),
++  .vfdsu_ex3_half                        (vfdsu_ex3_half                       ),
++  .vfdsu_ex3_bfloat                      (vfdsu_ex3_bfloat                     ),
+   .vfdsu_ex3_uf                          (vfdsu_ex3_uf                         ),
+   .vfdsu_ex4_denorm_to_tiny_frac         (vfdsu_ex4_denorm_to_tiny_frac        ),
+   .vfdsu_ex4_double                      (vfdsu_ex4_double                     ),
+@@ -330,6 +355,8 @@ ct_vfdsu_round  x_ct_vfdsu_round (
+   .vfdsu_ex4_result_zero                 (vfdsu_ex4_result_zero                ),
+   .vfdsu_ex4_rslt_denorm                 (vfdsu_ex4_rslt_denorm                ),
+   .vfdsu_ex4_single                      (vfdsu_ex4_single                     ),
++  .vfdsu_ex4_half                        (vfdsu_ex4_half                       ),
++  .vfdsu_ex4_bfloat                      (vfdsu_ex4_bfloat                     ),
+   .vfdsu_ex4_uf                          (vfdsu_ex4_uf                         )
+ );
+ 
+@@ -359,6 +386,8 @@ ct_vfdsu_pack  x_ct_vfdsu_pack (
+   .vfdsu_ex4_result_zero         (vfdsu_ex4_result_zero        ),
+   .vfdsu_ex4_rslt_denorm         (vfdsu_ex4_rslt_denorm        ),
+   .vfdsu_ex4_single              (vfdsu_ex4_single             ),
++  .vfdsu_ex4_half                (vfdsu_ex4_half               ),
++  .vfdsu_ex4_bfloat              (vfdsu_ex4_bfloat             ),
+   .vfdsu_ex4_uf                  (vfdsu_ex4_uf                 )
+ );
+ 
+diff --git a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v
+index e1d2e18..b29c70f 100644
+--- a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v
++++ b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_pack.v
+@@ -39,6 +39,8 @@ module ct_vfdsu_pack(
+   vfdsu_ex4_result_zero,
+   vfdsu_ex4_rslt_denorm,
+   vfdsu_ex4_single,
++  vfdsu_ex4_half,
++  vfdsu_ex4_bfloat,
+   vfdsu_ex4_uf
+ );
+ 
+@@ -65,6 +67,8 @@ input           vfdsu_ex4_result_sign;
+ input           vfdsu_ex4_result_zero;        
+ input           vfdsu_ex4_rslt_denorm;        
+ input           vfdsu_ex4_single;             
++input           vfdsu_ex4_half;
++input           vfdsu_ex4_bfloat;
+ input           vfdsu_ex4_uf;                 
+ output  [4 :0]  ex4_out_expt;                 
+ output  [63:0]  ex4_out_result;               
+@@ -73,6 +77,7 @@ output  [63:0]  ex4_out_result;
+ reg     [51:0]  ex4_denorm_frac;              
+ reg     [51:0]  ex4_frac_52;                  
+ reg     [51:0]  ex4_half_denorm_frac;         
++reg     [51:0]  ex4_bfloat_denorm_frac;
+ reg     [63:0]  ex4_out_result;               
+ reg     [51:0]  ex4_single_denorm_frac;       
+ reg     [12:0]  expnt_add_op1;                
+@@ -95,6 +100,11 @@ wire    [63:0]  ex4_half_rst0;
+ wire    [63:0]  ex4_half_rst_inf;             
+ wire    [63:0]  ex4_half_rst_norm;            
+ wire    [63:0]  ex4_half_rst_qnan;            
++wire    [63:0]  ex4_bfloat_lfn;
++wire    [63:0]  ex4_bfloat_rst0;
++wire    [63:0]  ex4_bfloat_rst_inf;
++wire    [63:0]  ex4_bfloat_rst_norm;
++wire    [63:0]  ex4_bfloat_rst_qnan;
+ wire            ex4_of_plus;                  
+ wire    [4 :0]  ex4_out_expt;                 
+ wire            ex4_result_inf;               
+@@ -134,6 +144,8 @@ wire            vfdsu_ex4_result_sign;
+ wire            vfdsu_ex4_result_zero;        
+ wire            vfdsu_ex4_rslt_denorm;        
+ wire            vfdsu_ex4_single;             
++wire            vfdsu_ex4_half;
++wire            vfdsu_ex4_bfloat;
+ wire            vfdsu_ex4_uf;                 
+ 
+ 
+@@ -276,6 +288,24 @@ case(vfdsu_ex4_expnt_rst[12:0])
+ endcase                                                                  
+ // &CombEnd; @147
+ end
++// &CombBeg; @132
++always @( vfdsu_ex4_expnt_rst[12:0]
++       or ex4_frac[54:1]
++       or vfdsu_ex4_denorm_to_tiny_frac)
++begin
++case(vfdsu_ex4_expnt_rst[12:0])
++  13'h1:   ex4_bfloat_denorm_frac[51:0] = {      ex4_frac[52:1]}; // expnt_rst =  1
++  13'h0:   ex4_bfloat_denorm_frac[51:0] = {      ex4_frac[53:2]}; // expnt_rst =  0
++  13'h1fff:ex4_bfloat_denorm_frac[51:0] = {      ex4_frac[54:3]}; // expnt_rst = -1
++  13'h1ffe:ex4_bfloat_denorm_frac[51:0] = {1'b0, ex4_frac[54:4]}; // expnt_rst = -2
++  13'h1ffd:ex4_bfloat_denorm_frac[51:0] = {2'b0, ex4_frac[54:5]}; // expnt_rst = -3
++  13'h1ffc:ex4_bfloat_denorm_frac[51:0] = {3'b0, ex4_frac[54:6]}; // expnt_rst = -4
++  13'h1ffb:ex4_bfloat_denorm_frac[51:0] = {4'b0, ex4_frac[54:7]}; // expnt_rst = -5
++  13'h1ffa:ex4_bfloat_denorm_frac[51:0] = {5'b0, ex4_frac[54:8]}; // expnt_rst = -6
++  default :ex4_bfloat_denorm_frac[51:0] = vfdsu_ex4_denorm_to_tiny_frac ?{7'b1,45'b0} : 52'b0; // otherwise: only bit 45 (lowest of the [51:45] bfloat fraction) or zero
++endcase
++// &CombEnd; @147
++end
+ 
+ //here when denormal number round to add1, it will become normal number
+ assign ex4_denorm_potnt_norm    = (vfdsu_ex4_potnt_norm[1] && ex4_frac[53]) || 
+@@ -286,9 +316,11 @@ assign ex4_rslt_denorm          = !vfdsu_ex4_result_qnan
+ assign ex4_denorm_result[63:0]  = vfdsu_ex4_double ? 
+                                   {vfdsu_ex4_result_sign,11'h0,ex4_denorm_frac[51:0]} :
+                                   vfdsu_ex4_single ? {32'hffffffff,vfdsu_ex4_result_sign,
+-                                        8'h0,ex4_single_denorm_frac[51:29]}  : {
+-                                        48'hffffffffffff,vfdsu_ex4_result_sign,5'h0,
+-                                        ex4_half_denorm_frac[51:42]};
++                                        8'h0,ex4_single_denorm_frac[51:29]}  :
++                                  vfdsu_ex4_half ? {48'hffffffffffff,vfdsu_ex4_result_sign,5'h0,
++                                        ex4_half_denorm_frac[51:42]}
++                                                 : {48'hffffffffffff,vfdsu_ex4_result_sign,8'h0,
++                                        ex4_bfloat_denorm_frac[51:45]};
+ 
+                                
+ 
+@@ -299,6 +331,15 @@ assign ex4_half_rst_norm[63:0] = {48'hffffffffffff,vfdsu_ex4_result_sign,
+                                   ex4_expnt_rst[4:0],
+                                   ex4_frac_52[51:42]};
+ assign ex4_half_rst0[63:0] = {48'hffffffffffff,vfdsu_ex4_result_sign,15'h0};                                
++
++assign ex4_bfloat_lfn[63:0]      = {48'hffffffffffff,vfdsu_ex4_result_sign,8'hfe,{7{1'b1}}}; // upper 48 bits all ones, exponent 0xfe, fraction all ones
++assign ex4_bfloat_rst_qnan[63:0] = {48'hffffffffffff,vfdsu_ex4_qnan_sign, 8'hff,1'b1, vfdsu_ex4_qnan_f[5:0]}; // exponent 0xff, fraction MSB forced to 1
++assign ex4_bfloat_rst_inf[63:0]  = {48'hffffffffffff,vfdsu_ex4_result_sign,8'hff,7'b0}; // exponent 0xff, fraction zero
++assign ex4_bfloat_rst_norm[63:0] = {48'hffffffffffff,vfdsu_ex4_result_sign, // sign + 8-bit exponent + 7-bit fraction
++                                  ex4_expnt_rst[7:0],
++                                  ex4_frac_52[51:45]};
++assign ex4_bfloat_rst0[63:0] = {48'hffffffffffff,vfdsu_ex4_result_sign,15'h0}; // signed zero in the low 16 bits
++
+ //ex4 overflow/underflow plus                                 
+ assign ex4_rst_nor = vfdsu_ex4_result_nor;                    
+ assign ex4_of_plus = vfdsu_ex4_potnt_of  && 
+@@ -345,21 +386,23 @@ assign ex4_sing_rst_norm[63:0] = {32'hffffffff,vfdsu_ex4_result_sign,
+                                   ex4_expnt_rst[7:0],
+                                   ex4_frac_52[51:29]};
+ assign ex4_rst_lfn[63:0]       = (vfdsu_ex4_double) ? ex4_doub_lfn[63:0] :
+-                                  vfdsu_ex4_single  ? ex4_sing_lfn[63:0] : ex4_half_lfn[63:0];
++                                  vfdsu_ex4_single  ? ex4_sing_lfn[63:0] :
++                                  vfdsu_ex4_half    ? ex4_half_lfn[63:0] : ex4_bfloat_lfn[63:0];
+ 
+ assign ex4_rst0[63:0]          = (vfdsu_ex4_double) ? ex4_doub_rst0[63:0] :
+-                                  vfdsu_ex4_single  ? ex4_sing_rst0[63:0] : ex4_half_rst0[63:0];
++                                  vfdsu_ex4_single  ? ex4_sing_rst0[63:0] :
++                                  vfdsu_ex4_half    ? ex4_half_rst0[63:0] : ex4_bfloat_rst0[63:0];
+ 
+ assign ex4_rst_qnan[63:0]      = (vfdsu_ex4_double) ? ex4_doub_rst_qnan[63:0] :
+-                                  vfdsu_ex4_single  ? ex4_sing_rst_qnan[63:0] 
+-                                                    : ex4_half_rst_qnan[63:0];
++                                  vfdsu_ex4_single  ? ex4_sing_rst_qnan[63:0] :
++                                  vfdsu_ex4_half    ? ex4_half_rst_qnan[63:0] : ex4_bfloat_rst_qnan[63:0];
+ 
+ assign ex4_rst_norm[63:0]      = (vfdsu_ex4_double) ? ex4_doub_rst_norm[63:0] :
+-                                  vfdsu_ex4_single  ? ex4_sing_rst_norm[63:0]
+-                                                    : ex4_half_rst_norm[63:0];
++                                  vfdsu_ex4_single  ? ex4_sing_rst_norm[63:0] :
++                                  vfdsu_ex4_half    ? ex4_half_rst_norm[63:0] : ex4_bfloat_rst_norm[63:0];
+ assign ex4_rst_inf[63:0]       = (vfdsu_ex4_double) ? ex4_doub_rst_inf[63:0] :
+-                                  vfdsu_ex4_single  ? ex4_sing_rst_inf[63:0]
+-                                                    : ex4_half_rst_inf[63:0];
++                                  vfdsu_ex4_single  ? ex4_sing_rst_inf[63:0] :
++                                  vfdsu_ex4_half    ? ex4_half_rst_inf[63:0] : ex4_bfloat_rst_inf[63:0];
+ 
+       
+ assign ex4_cor_uf            = (vfdsu_ex4_uf && !ex4_denorm_potnt_norm || ex4_uf_plus)
+diff --git a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v
+index 7c5821c..0ef958a 100644
+--- a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v
++++ b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_prepare.v
+@@ -25,6 +25,8 @@ module ct_vfdsu_prepare(
+   ex1_remainder,
+   ex1_scalar,
+   ex1_single,
++  ex1_half,
++  ex1_bfloat,
+   ex1_sqrt,
+   ex1_src0,
+   ex1_src1,
+@@ -48,6 +50,8 @@ module ct_vfdsu_prepare(
+   vfdsu_ex2_result_zero,
+   vfdsu_ex2_rm,
+   vfdsu_ex2_single,
++  vfdsu_ex2_half,
++  vfdsu_ex2_bfloat,
+   vfdsu_ex2_sqrt,
+   vfdsu_ex2_srt_skip,
+   vfpu_yy_xx_dqnan,
+@@ -63,6 +67,8 @@ input           ex1_double;
+ input           ex1_pipedown;             
+ input           ex1_scalar;               
+ input           ex1_single;               
++input           ex1_half;
++input           ex1_bfloat;
+ input           ex1_sqrt;                 
+ input   [63:0]  ex1_src0;                 
+ input   [63:0]  ex1_src1;                 
+@@ -90,6 +96,8 @@ output          vfdsu_ex2_result_sign;
+ output          vfdsu_ex2_result_zero;    
+ output  [2 :0]  vfdsu_ex2_rm;             
+ output          vfdsu_ex2_single;         
++output          vfdsu_ex2_half;
++output          vfdsu_ex2_bfloat;
+ output          vfdsu_ex2_sqrt;           
+ output          vfdsu_ex2_srt_skip;       
+ 
+@@ -115,6 +123,8 @@ reg             vfdsu_ex2_result_sign;
+ reg             vfdsu_ex2_result_zero;    
+ reg     [2 :0]  vfdsu_ex2_rm;             
+ reg             vfdsu_ex2_single;         
++reg             vfdsu_ex2_half;
++reg             vfdsu_ex2_bfloat;
+ reg             vfdsu_ex2_sqrt;           
+ reg             vfdsu_ex2_srt_skip;       
+ 
+@@ -161,6 +171,12 @@ wire            ex1_half_expnt1_max;
+ wire            ex1_half_expnt1_zero;     
+ wire            ex1_half_frac0_all0;      
+ wire            ex1_half_frac1_all0;      
++wire            ex1_bfloat_expnt0_max;
++wire            ex1_bfloat_expnt1_max;
++wire            ex1_bfloat_expnt0_zero;
++wire            ex1_bfloat_expnt1_zero;
++wire            ex1_bfloat_frac0_all0;
++wire            ex1_bfloat_frac1_all0;
+ wire            ex1_nv;                   
+ wire            ex1_op0_cnan;             
+ wire    [51:0]  ex1_op0_f;                
+@@ -216,6 +232,8 @@ wire            ex1_sing_expnt1_zero;
+ wire            ex1_sing_frac0_all0;      
+ wire            ex1_sing_frac1_all0;      
+ wire            ex1_single;               
++wire            ex1_half;
++wire            ex1_bfloat;
+ wire            ex1_sqrt;                 
+ wire            ex1_sqrt_expnt_odd;       
+ wire            ex1_sqrt_expnt_result_odd; 
+@@ -246,9 +264,11 @@ assign ex1_oper1[63:0]             = ex1_src1[63:0];
+ 
+ //Sign bit prepare
+ assign ex1_op0_sign                =  ex1_double ? ex1_oper0[63] :
+-                                      ex1_single ? ex1_oper0[31] : ex1_oper0[15]; 
++                                      ex1_single ? ex1_oper0[31] :
++                                      ex1_half   ? ex1_oper0[15] : ex1_oper0[15];
+ assign ex1_op1_sign                =  ex1_double ? ex1_oper1[63] :
+-                                      ex1_single ? ex1_oper1[31] : ex1_oper1[15]; 
++                                      ex1_single ? ex1_oper1[31] :
++                                      ex1_half   ? ex1_oper1[15] : ex1_oper1[15];
+ assign div_sign                    = ex1_op0_sign ^ ex1_op1_sign;
+ assign sqrt_sign                   = ex1_op0_sign;
+ assign ex1_result_sign             = (ex1_div)
+@@ -261,10 +281,14 @@ assign ex1_doub_expnt1_max         = &ex1_oper1[62:52];
+ assign ex1_sing_expnt1_max         = &ex1_oper1[30:23];
+ assign ex1_half_expnt0_max         = &ex1_oper0[14:10];
+ assign ex1_half_expnt1_max         = &ex1_oper1[14:10];
++assign ex1_bfloat_expnt0_max       = &ex1_oper0[14:7];
++assign ex1_bfloat_expnt1_max       = &ex1_oper1[14:7];
+ assign ex1_expnt0_max              = ex1_double ? ex1_doub_expnt0_max :
+-                                     ex1_single ? ex1_sing_expnt0_max : ex1_half_expnt0_max;
++                                     ex1_single ? ex1_sing_expnt0_max :
++                                     ex1_half   ? ex1_half_expnt0_max : ex1_bfloat_expnt0_max;
+ assign ex1_expnt1_max              = ex1_double ? ex1_doub_expnt1_max :
+-                                     ex1_single ? ex1_sing_expnt1_max : ex1_half_expnt1_max;
++                                     ex1_single ? ex1_sing_expnt1_max :
++                                     ex1_half   ? ex1_half_expnt1_max : ex1_bfloat_expnt1_max;
+              
+ //exponent zero
+ assign ex1_doub_expnt0_zero        = ~|ex1_oper0[62:52];
+@@ -273,10 +297,15 @@ assign ex1_doub_expnt1_zero        = ~|ex1_oper1[62:52];
+ assign ex1_sing_expnt1_zero        = ~|ex1_oper1[30:23];
+ assign ex1_half_expnt0_zero        = ~|ex1_oper0[14:10];
+ assign ex1_half_expnt1_zero        = ~|ex1_oper1[14:10];
++assign ex1_bfloat_expnt0_zero      = ~|ex1_oper0[14:7];
++assign ex1_bfloat_expnt1_zero      = ~|ex1_oper1[14:7];
+ assign ex1_expnt0_zero             = ex1_double ? ex1_doub_expnt0_zero :
+-                                     ex1_single ? ex1_sing_expnt0_zero : ex1_half_expnt0_zero;
++                                     ex1_single ? ex1_sing_expnt0_zero :
++                                     ex1_half   ? ex1_half_expnt0_zero : ex1_bfloat_expnt0_zero;
+ assign ex1_expnt1_zero             = ex1_double ? ex1_doub_expnt1_zero :
+-                                     ex1_single ? ex1_sing_expnt1_zero : ex1_half_expnt1_zero; 
++                                     ex1_single ? ex1_sing_expnt1_zero :
++                                     ex1_half   ? ex1_half_expnt1_zero : ex1_bfloat_expnt1_zero;
++
+ //fraction zero
+ assign ex1_doub_frac0_all0         = ~|ex1_oper0[51:0];
+ assign ex1_sing_frac0_all0         = ~|ex1_oper0[22:0];
+@@ -284,14 +313,20 @@ assign ex1_doub_frac1_all0         = ~|ex1_oper1[51:0];
+ assign ex1_sing_frac1_all0         = ~|ex1_oper1[22:0];
+ assign ex1_half_frac0_all0         = ~|ex1_oper0[9:0];
+ assign ex1_half_frac1_all0         = ~|ex1_oper1[9:0];
++assign ex1_bfloat_frac0_all0       = ~|ex1_oper0[6:0];
++assign ex1_bfloat_frac1_all0       = ~|ex1_oper1[6:0];
+ assign ex1_frac0_all0              = ex1_double ? ex1_doub_frac0_all0 :
+-                                     ex1_single ? ex1_sing_frac0_all0 : ex1_half_frac0_all0;   
++                                     ex1_single ? ex1_sing_frac0_all0 :
++                                     ex1_half ?   ex1_half_frac0_all0 : ex1_bfloat_frac0_all0;
+ assign ex1_frac1_all0              = ex1_double ? ex1_doub_frac1_all0 :
+-                                     ex1_single ? ex1_sing_frac1_all0 : ex1_half_frac1_all0;   
++                                     ex1_single ? ex1_sing_frac1_all0 :
++                                     ex1_half ?   ex1_half_frac1_all0 : ex1_bfloat_frac1_all0;
+ assign ex1_frac0_msb               = ex1_double ? ex1_oper0[51] :
+-                                     ex1_single ? ex1_oper0[22] : ex1_oper0[9];
++                                     ex1_single ? ex1_oper0[22] :
++                                     ex1_half   ? ex1_oper0[9]  : ex1_oper0[6];
+ assign ex1_frac1_msb               = ex1_double ? ex1_oper1[51] :
+-                                     ex1_single ? ex1_oper1[22] : ex1_oper1[9]; 
++                                     ex1_single ? ex1_oper1[22] :
++                                     ex1_half   ? ex1_oper1[9]  : ex1_oper1[6];
+ assign ex1_oper0_high_all1         = ex1_single ? &ex1_oper0[63:32] : &ex1_oper0[63:16]; 
+ assign ex1_oper1_high_all1         = ex1_single ? &ex1_oper1[63:32] : &ex1_oper1[63:16];
+  
+@@ -382,25 +417,30 @@ ct_vfdsu_ff1  x_frac1_expnt (
+ // &Connect(.frac_bin_val(ex1_oper1_id_expnt[12:0])); @157
+ // &Connect(.fanc_shift_num(ex1_oper1_id_frac[51:0])); @158
+ assign ex1_oper0_frac[51:0] = ex1_double ? ex1_oper0[51:0] :
+-                                           ex1_single ? {ex1_oper0[22:0],29'b0}
+-                                                      : {ex1_oper0[9:0],42'b0};
++                                           ex1_single ? {ex1_oper0[22:0],29'b0} :
++                                           ex1_half   ? {ex1_oper0[9:0],42'b0}
++                                                      : {ex1_oper0[6:0],45'b0};
+ assign ex1_oper1_frac[51:0] = ex1_double ? ex1_oper1[51:0] :
+-                                           ex1_single ? {ex1_oper1[22:0],29'b0}
+-                                                      : {ex1_oper1[9:0],42'b0};
++                                           ex1_single ? {ex1_oper1[22:0],29'b0} :
++                                           ex1_half   ? {ex1_oper1[9:0],42'b0}
++                                                      : {ex1_oper1[6:0],45'b0};
+ //=====================exponent add=========================
+ //exponent number 0
+ assign ex1_div_op0_expnt[12:0]     = ex1_double ? {2'b0,ex1_oper0[62:52]} : 
+-                                                  ex1_single ? {5'b0,ex1_oper0[30:23]}
+-                                                             : {8'b0,ex1_oper0[14:10]};
++                                                  ex1_single ? {5'b0,ex1_oper0[30:23]} :
++                                                  ex1_half   ? {8'b0,ex1_oper0[14:10]}
++                                                             : {5'b0,ex1_oper0[14:7]};
+ assign ex1_expnt_adder_op0[12:0]   = ex1_op0_id_nor ? ex1_oper0_id_expnt[12:0]
+                                                     : ex1_div_op0_expnt[12:0];
+ //exponent number 1
+ assign ex1_div_op1_expnt[12:0]  = ex1_double ? {2'b0,ex1_oper1[62:52]} :
+-                                               ex1_single ? {5'b0,ex1_oper1[30:23]}
+-                                                          : {8'b0,ex1_oper1[14:10]};
++                                               ex1_single ? {5'b0,ex1_oper1[30:23]} :
++                                               ex1_half   ? {8'b0,ex1_oper1[14:10]}
++                                                          : {5'b0,ex1_oper1[14:7]};
+ assign ex1_sqrt_op1_expnt[12:0] = ex1_double ? {3'b0,{10{1'b1}}} : //'d1023
+-                                               ex1_single ? {6'b0,{7{1'b1}}} //'d127
+-                                                          : {9'b0,{4{1'b1}}}; //'d15
++                                               ex1_single ? {6'b0,{7{1'b1}}} ://'d127
++                                               ex1_half   ? {9'b0,{4{1'b1}}}  //'d15
++                                                          : {6'b0,{7{1'b1}}}; //'d127
+   
+ // &CombBeg;  @180
+ always @( ex1_oper1_id_expnt[12:0]
+@@ -569,11 +609,13 @@ assign ex1_div_srt_op0[52:0]     = ex1_div_nor_srt_op0[52:0];
+ assign ex1_div_srt_op1[52:0]     =  ex1_div_nor_srt_op1[52:0];
+ //ex1_div_nor_srt_op0
+ assign ex1_div_noid_nor_srt_op0[52:0] = ex1_double ? {1'b1,ex1_oper0[51:0]} :
+-                                                     ex1_single ? {1'b1,ex1_oper0[22:0],29'b0}
+-                                                                : {1'b1,ex1_oper0[9:0],42'b0};
++                                                     ex1_single ? {1'b1,ex1_oper0[22:0],29'b0} :
++                                                     ex1_half   ? {1'b1,ex1_oper0[9:0],42'b0}
++                                                                : {1'b1,ex1_oper0[6:0],45'b0};
+ assign ex1_div_noid_nor_srt_op1[52:0] = ex1_double ? {1'b1,ex1_oper1[51:0]} :
+-                                                     ex1_single ? {1'b1,ex1_oper1[22:0],29'b0}
+-                                                                : {1'b1,ex1_oper1[9:0],42'b0};
++                                                     ex1_single ? {1'b1,ex1_oper1[22:0],29'b0} :
++                                                     ex1_half   ? {1'b1,ex1_oper1[9:0],42'b0}
++                                                                : {1'b1,ex1_oper1[6:0],45'b0};
+ assign ex1_div_nor_srt_op0[52:0] = ex1_op0_id_nor ? {ex1_oper0_id_frac[51:0],1'b0} 
+                                                   : ex1_div_noid_nor_srt_op0[52:0];
+ //ex1_div_nor_srt_op1
+@@ -699,6 +741,8 @@ begin
+     vfdsu_ex2_sqrt            <=  1'b0;
+     vfdsu_ex2_double          <=  1'b0;
+     vfdsu_ex2_single          <=  1'b0;
++    vfdsu_ex2_half            <=  1'b0;
++    vfdsu_ex2_bfloat          <=  1'b0;
+   end
+   else if(ex1_pipedown)
+   begin
+@@ -721,6 +765,8 @@ begin
+     vfdsu_ex2_sqrt            <= ex1_sqrt;
+     vfdsu_ex2_double          <= ex1_double;
+     vfdsu_ex2_single          <= ex1_single;
++    vfdsu_ex2_half            <= ex1_half;
++    vfdsu_ex2_bfloat          <= ex1_bfloat;
+   end
+   else
+   begin
+@@ -743,6 +789,8 @@ begin
+     vfdsu_ex2_sqrt            <= vfdsu_ex2_sqrt;
+     vfdsu_ex2_double          <= vfdsu_ex2_double;
+     vfdsu_ex2_single          <= vfdsu_ex2_single;
++    vfdsu_ex2_half            <= vfdsu_ex2_half;
++    vfdsu_ex2_bfloat          <= vfdsu_ex2_bfloat;
+   end
+ end
+ 
+diff --git a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v
+index 6eece52..a419289 100644
+--- a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v
++++ b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v
+@@ -27,6 +27,7 @@ module ct_vfdsu_round(
+   vfdsu_ex3_double,
+   vfdsu_ex3_dz,
+   vfdsu_ex3_half_expnt_rst,
++  vfdsu_ex3_bfloat_expnt_rst,
+   vfdsu_ex3_id_srt_skip,
+   vfdsu_ex3_nv,
+   vfdsu_ex3_of,
+@@ -46,6 +47,8 @@ module ct_vfdsu_round(
+   vfdsu_ex3_rslt_denorm,
+   vfdsu_ex3_sing_expnt_rst,
+   vfdsu_ex3_single,
++  vfdsu_ex3_half,
++  vfdsu_ex3_bfloat,
+   vfdsu_ex3_uf,
+   vfdsu_ex4_denorm_to_tiny_frac,
+   vfdsu_ex4_double,
+@@ -69,6 +72,8 @@ module ct_vfdsu_round(
+   vfdsu_ex4_result_zero,
+   vfdsu_ex4_rslt_denorm,
+   vfdsu_ex4_single,
++  vfdsu_ex4_half,
++  vfdsu_ex4_bfloat,
+   vfdsu_ex4_uf
+ );
+ 
+@@ -85,6 +90,7 @@ input   [12:0]  vfdsu_ex3_doub_expnt_rst;
+ input           vfdsu_ex3_double;                     
+ input           vfdsu_ex3_dz;                         
+ input   [12:0]  vfdsu_ex3_half_expnt_rst;             
++input   [12:0]  vfdsu_ex3_bfloat_expnt_rst;
+ input           vfdsu_ex3_id_srt_skip;                
+ input           vfdsu_ex3_nv;                         
+ input           vfdsu_ex3_of;                         
+@@ -104,6 +110,8 @@ input   [2 :0]  vfdsu_ex3_rm;
+ input           vfdsu_ex3_rslt_denorm;                
+ input   [8 :0]  vfdsu_ex3_sing_expnt_rst;             
+ input           vfdsu_ex3_single;                     
++input           vfdsu_ex3_half;
++input           vfdsu_ex3_bfloat;
+ input           vfdsu_ex3_uf;                         
+ output          vfdsu_ex4_denorm_to_tiny_frac;        
+ output          vfdsu_ex4_double;                     
+@@ -127,6 +135,8 @@ output          vfdsu_ex4_result_sign;
+ output          vfdsu_ex4_result_zero;                
+ output          vfdsu_ex4_rslt_denorm;                
+ output          vfdsu_ex4_single;                     
++output          vfdsu_ex4_half;
++output          vfdsu_ex4_bfloat;
+ output          vfdsu_ex4_uf;                         
+ 
+ // &Regs; @24
+@@ -138,8 +148,10 @@ reg             frac_orig;
+ reg     [54:0]  frac_sub1_op1;                        
+ reg             frac_sub_1;                           
+ reg             half_denorm_lst_frac;                 
++reg             bfloat_denorm_lst_frac;
+ reg     [56:0]  qt_result_double_denorm_for_round;    
+ reg     [13:0]  qt_result_half_denorm_for_round;      
++reg     [10:0]  qt_result_bfloat_denorm_for_round;
+ reg     [27:0]  qt_result_single_denorm_for_round;    
+ reg             single_denorm_lst_frac;               
+ reg             vfdsu_ex4_denorm_to_tiny_frac;        
+@@ -164,6 +176,8 @@ reg             vfdsu_ex4_result_sign;
+ reg             vfdsu_ex4_result_zero;                
+ reg             vfdsu_ex4_rslt_denorm;                
+ reg             vfdsu_ex4_single;                     
++reg             vfdsu_ex4_half;
++reg             vfdsu_ex4_bfloat;
+ reg             vfdsu_ex4_uf;                         
+ 
+ // &Wires; @25
+@@ -199,6 +213,16 @@ wire            ex3_half_gr;
+ wire            ex3_half_low_not_zero;                
+ wire            ex3_half_rst_eq_1;                    
+ wire            ex3_half_zero;                        
++wire            ex3_bfloat_denorm_eq;
++wire            ex3_bfloat_denorm_gr;
++wire            ex3_bfloat_denorm_plus;
++wire            ex3_bfloat_denorm_potnt_norm;
++wire            ex3_bfloat_denorm_zero;
++wire            ex3_bfloat_eq;
++wire            ex3_bfloat_gr;
++wire            ex3_bfloat_low_not_zero;
++wire            ex3_bfloat_rst_eq_1;
++wire            ex3_bfloat_zero;
+ wire            ex3_nx;                               
+ wire            ex3_pipe_clk;                         
+ wire            ex3_pipe_clk_en;                      
+@@ -210,6 +234,8 @@ wire            ex3_qt_eq;
+ wire            ex3_qt_gr;                            
+ wire            ex3_qt_half_lo2_not0;                 
+ wire            ex3_qt_half_lo3_not0;                 
++wire            ex3_qt_bfloat_lo2_not0;
++wire            ex3_qt_bfloat_lo3_not0;
+ wire            ex3_qt_sing_lo3_not0;                 
+ wire            ex3_qt_sing_lo4_not0;                 
+ wire            ex3_qt_zero;                          
+@@ -254,6 +280,7 @@ wire            vfdsu_ex3_double;
+ wire            vfdsu_ex3_dz;                         
+ wire    [12:0]  vfdsu_ex3_expnt_rst;                  
+ wire    [12:0]  vfdsu_ex3_half_expnt_rst;             
++wire    [12:0]  vfdsu_ex3_bfloat_expnt_rst;
+ wire            vfdsu_ex3_id_srt_skip;                
+ wire            vfdsu_ex3_nv;                         
+ wire            vfdsu_ex3_of;                         
+@@ -273,6 +300,8 @@ wire    [2 :0]  vfdsu_ex3_rm;
+ wire            vfdsu_ex3_rslt_denorm;                
+ wire    [8 :0]  vfdsu_ex3_sing_expnt_rst;             
+ wire            vfdsu_ex3_single;                     
++wire            vfdsu_ex3_half;
++wire            vfdsu_ex3_bfloat;
+ wire            vfdsu_ex3_uf;                         
+ 
+ 
+@@ -302,6 +331,22 @@ assign ex3_half_zero        = (total_qt_rt_58[56])
+ assign ex3_half_rst_eq_1    = total_qt_rt_58[56] && ~|total_qt_rt_58[55:46];       
+ assign ex3_half_denorm_plus = !total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1ff2);
+ assign ex3_half_denorm_potnt_norm = total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1ff1);
++
++assign ex3_qt_bfloat_lo3_not0 = |total_qt_rt_58[47:45]; // bits below [48], used when total_qt_rt_58[56] is set
++assign ex3_qt_bfloat_lo2_not0 = |total_qt_rt_58[46:45]; // bits below [47], used when total_qt_rt_58[56] is clear
++assign ex3_bfloat_gr       = total_qt_rt_58[56]
++                              ? total_qt_rt_58[48] && ex3_qt_bfloat_lo3_not0
++                              : total_qt_rt_58[47] && ex3_qt_bfloat_lo2_not0;
++assign ex3_bfloat_eq          = (total_qt_rt_58[56]) // same bit and low-bit signals as ex3_bfloat_gr, low bits all zero
++                            ?  total_qt_rt_58[48] && !ex3_qt_bfloat_lo3_not0
++                            :  total_qt_rt_58[47] && !ex3_qt_bfloat_lo2_not0;
++assign ex3_bfloat_zero        = (total_qt_rt_58[56])
++                            ? ~|total_qt_rt_58[48:45]
++                            : ~|total_qt_rt_58[47:45];
++assign ex3_bfloat_rst_eq_1    = total_qt_rt_58[56] && ~|total_qt_rt_58[55:49]; // bit 56 set, bits [55:49] all zero
++assign ex3_bfloat_denorm_plus = !total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1f82);
++assign ex3_bfloat_denorm_potnt_norm = total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1f81);
++
+ assign vfdsu_ex3_expnt_rst[12:0]  = vfdsu_ex3_half_expnt_rst[12:0];
+ // &Force("bus","total_qt_rt_58",57,0); @54
+ assign ex3_qt_doub_lo3_not0 = |total_qt_rt_58[2:0]; 
+@@ -343,19 +388,24 @@ assign ex3_doub_denorm_potnt_norm = total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[1
+ assign ex3_sing_denorm_potnt_norm = total_qt_rt_58[56] && (vfdsu_ex3_expnt_rst[12:0] == 13'h1f81);
+ assign ex3_rslt_denorm            = ex3_denorm_plus || vfdsu_ex3_rslt_denorm;
+ assign ex3_denorm_potnt_norm      = vfdsu_ex3_double ? ex3_doub_denorm_potnt_norm :
+-                                                       vfdsu_ex3_single ? ex3_sing_denorm_potnt_norm
+-                                                                        : ex3_half_denorm_potnt_norm;
++                                                       vfdsu_ex3_single ? ex3_sing_denorm_potnt_norm :
++                                                       vfdsu_ex3_half   ? ex3_half_denorm_potnt_norm
++                                                                        : ex3_bfloat_denorm_potnt_norm;
+ assign ex3_rst_eq_1         = (vfdsu_ex3_double)? ex3_doub_rst_eq_1 :
+-                               vfdsu_ex3_single ? ex3_sing_rst_eq_1 : ex3_half_rst_eq_1;
++                               vfdsu_ex3_single ? ex3_sing_rst_eq_1 :
++                               vfdsu_ex3_half   ? ex3_half_rst_eq_1 : ex3_bfloat_rst_eq_1;
+ assign ex3_qt_eq            = (vfdsu_ex3_double)? ex3_doub_eq :
+-                               vfdsu_ex3_single ? ex3_sing_eq : ex3_half_eq;
++                               vfdsu_ex3_single ? ex3_sing_eq :
++                               vfdsu_ex3_half   ? ex3_half_eq : ex3_bfloat_eq;
+ assign ex3_qt_gr            = (vfdsu_ex3_double)? ex3_doub_gr :
+-                               vfdsu_ex3_single ? ex3_sing_gr : ex3_half_gr;
++                               vfdsu_ex3_single ? ex3_sing_gr :
++                               vfdsu_ex3_half   ? ex3_half_gr : ex3_bfloat_gr;
+ assign ex3_qt_zero          = (vfdsu_ex3_double)? ex3_doub_zero :
+-                               vfdsu_ex3_single ? ex3_sing_zero : ex3_half_zero;
++                               vfdsu_ex3_single ? ex3_sing_zero :
++                               vfdsu_ex3_half   ? ex3_half_zero : ex3_bfloat_zero;
+ assign ex3_denorm_plus            = (vfdsu_ex3_double)  ? ex3_doub_denorm_plus 
+                                     : vfdsu_ex3_single ? ex3_sing_denorm_plus
+-                                                       : ex3_half_denorm_plus;
++                                    : vfdsu_ex3_half  ? ex3_half_denorm_plus : ex3_bfloat_denorm_plus;
+                              
+ // &CombBeg; @108
+ always @( vfdsu_ex3_doub_expnt_rst[12:0]
+@@ -682,14 +732,64 @@ assign ex3_half_denorm_gr      = qt_result_half_denorm_for_round[13]
+ assign ex3_half_denorm_zero    = !qt_result_half_denorm_for_round[13] 
+                                    && !ex3_half_low_not_zero;
+ 
++always @( vfdsu_ex3_bfloat_expnt_rst[8:0]
++       or total_qt_rt_58[56:45])
++begin // per exponent: left-align the quotient bits below the kept LSB into [10:0], and pick that LSB
++case(vfdsu_ex3_bfloat_expnt_rst[8:0])
++  9'h182:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[48:45],7'b0}; //-126 1
++                bfloat_denorm_lst_frac =  total_qt_rt_58[49];
++          end
++  9'h181:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[49:45],6'b0}; //-127 0
++                bfloat_denorm_lst_frac =  total_qt_rt_58[50];
++          end
++  9'h180:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[50:45],5'b0}; //-128 -1
++                bfloat_denorm_lst_frac =  total_qt_rt_58[51];
++          end
++  9'h17f:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[51:45],4'b0}; //-129 -2
++                bfloat_denorm_lst_frac =  total_qt_rt_58[52];
++          end
++  9'h17e:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[52:45],3'b0}; //-130 -3
++                bfloat_denorm_lst_frac =  total_qt_rt_58[53];
++          end
++  9'h17d:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[53:45],2'b0}; //-131 -4
++                bfloat_denorm_lst_frac =  total_qt_rt_58[54];
++          end
++  9'h17c:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[54:45],1'b0}; //-132 -5
++                bfloat_denorm_lst_frac =  total_qt_rt_58[55];
++          end
++  9'h17b:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[55:45]}; //-133 -6
++                bfloat_denorm_lst_frac =  total_qt_rt_58[56];
++          end
++  9'h17a:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[56:46]}; //-134 -7
++                bfloat_denorm_lst_frac =  1'b0;
++          end
++  default:  begin qt_result_bfloat_denorm_for_round[10:0] = 11'b0; // sized literal: '0 is SystemVerilog-only
++                 bfloat_denorm_lst_frac = 1'b0;
++            end
++endcase
++// &CombEnd;  @363
++end
++//rounding evaluation for bfloat denormalized numbers: bit [10] vs. any of bits [9:0]
++assign ex3_bfloat_denorm_eq      = qt_result_bfloat_denorm_for_round[10]
++                                   &&  !ex3_bfloat_low_not_zero;
++assign ex3_bfloat_low_not_zero   = |qt_result_bfloat_denorm_for_round[9:0];
++assign ex3_bfloat_denorm_gr      = qt_result_bfloat_denorm_for_round[10]
++                                   &&  ex3_bfloat_low_not_zero;
++assign ex3_bfloat_denorm_zero    = !qt_result_bfloat_denorm_for_round[10]
++                                   && !ex3_bfloat_low_not_zero;
++
+ assign ex3_denorm_eq             = vfdsu_ex3_double ? ex3_double_denorm_eq :
+-                                   vfdsu_ex3_single ? ex3_single_denorm_eq : ex3_half_denorm_eq;
++                                   vfdsu_ex3_single ? ex3_single_denorm_eq :
++                                   vfdsu_ex3_half   ? ex3_half_denorm_eq   : ex3_bfloat_denorm_eq;
+ assign ex3_denorm_gr             = vfdsu_ex3_double ? ex3_double_denorm_gr :
+-                                   vfdsu_ex3_single ? ex3_single_denorm_gr : ex3_half_denorm_gr;
++                                   vfdsu_ex3_single ? ex3_single_denorm_gr :
++                                   vfdsu_ex3_half   ? ex3_half_denorm_gr   : ex3_bfloat_denorm_gr;
+ assign ex3_denorm_zero           = vfdsu_ex3_double ? ex3_double_denorm_zero :
+-                                   vfdsu_ex3_single ? ex3_single_denorm_zero : ex3_half_denorm_zero;
++                                   vfdsu_ex3_single ? ex3_single_denorm_zero :
++                                   vfdsu_ex3_half   ? ex3_half_denorm_zero   : ex3_bfloat_denorm_zero;
+ assign ex3_denorm_lst_frac       = vfdsu_ex3_double ? double_denorm_lst_frac :
+-                                   vfdsu_ex3_single ? single_denorm_lst_frac : half_denorm_lst_frac;
++                                   vfdsu_ex3_single ? single_denorm_lst_frac :
++                                   vfdsu_ex3_half   ? half_denorm_lst_frac   : bfloat_denorm_lst_frac;
+   
+ //Different Round Mode with different rounding rule
+ //Here we call rounding bit as "rb", remainder as "rem"
+@@ -824,7 +924,9 @@ end
+ // &CombBeg; @540
+ always @( total_qt_rt_58[56]
+        or vfdsu_ex3_single
+-       or vfdsu_ex3_double)
++       or vfdsu_ex3_double
++       or vfdsu_ex3_half
++       or vfdsu_ex3_bfloat)
+ begin
+ case({total_qt_rt_58[56],vfdsu_ex3_double,vfdsu_ex3_single})
+   3'b001: 
+@@ -849,13 +951,23 @@ case({total_qt_rt_58[56],vfdsu_ex3_double,vfdsu_ex3_single})
+   end
+   3'b100:
+   begin
+-    frac_add1_op1[54:0] = {12'b1,43'b0};
+-    frac_sub1_op1[54:0] = {{12{1'b1}},43'b0};
++    if (vfdsu_ex3_half) begin
++      frac_add1_op1[54:0] = {12'b1,43'b0};
++      frac_sub1_op1[54:0] = {{12{1'b1}},43'b0};
++    end else begin
++      frac_add1_op1[54:0] = {9'b1,46'b0};
++      frac_sub1_op1[54:0] = {{9{1'b1}},46'b0};
++    end
+   end
+   3'b000:
+   begin
+-    frac_add1_op1[54:0] = {13'b1,42'b0};
+-    frac_sub1_op1[54:0] = {{13{1'b1}},42'b0};
++    if (vfdsu_ex3_half) begin
++      frac_add1_op1[54:0] = {13'b1,42'b0};
++      frac_sub1_op1[54:0] = {{13{1'b1}},42'b0};
++    end else begin
++      frac_add1_op1[54:0] = {10'b1,45'b0};
++      frac_sub1_op1[54:0] = {{10{1'b1}},45'b0};
++    end
+   end
+   default:
+   begin
+@@ -898,7 +1010,7 @@ assign ex3_nx      = ex3_rst_nor &&
+ assign ex3_denorm_nx = ex3_rslt_denorm && (!ex3_denorm_zero ||  !vfdsu_ex3_rem_zero);
+ //Adjust expnt
+ //Div:Actural expnt should plus 1 when op0 is id, sub 1 when op1 id
+-assign ex3_expnt_adjst[12:0] = vfdsu_ex3_double ? 13'h3ff: vfdsu_ex3_single ? 13'h7f : 13'hf;
++assign ex3_expnt_adjst[12:0] = vfdsu_ex3_double ? 13'h3ff: vfdsu_ex3_single ? 13'h7f : vfdsu_ex3_half ? 13'hf : 13'h7f;
+ assign ex3_expnt_adjust_result[12:0] = vfdsu_ex3_expnt_rst[12:0] + 
+                                        ex3_expnt_adjst[12:0];
+ //this information is for the packing, which determin the result is normal
+@@ -954,6 +1066,8 @@ begin
+     vfdsu_ex4_potnt_norm[1:0] <= 2'b0;
+     vfdsu_ex4_double          <= 1'b0;
+     vfdsu_ex4_single          <= 1'b0;
++    vfdsu_ex4_half            <= 1'b0;
++    vfdsu_ex4_bfloat          <= 1'b0;
+ 
+   end
+   else if(ex3_pipedown)
+@@ -982,6 +1096,8 @@ begin
+     vfdsu_ex4_potnt_norm[1:0] <= ex3_potnt_norm[1:0];
+     vfdsu_ex4_double          <= vfdsu_ex3_double;
+     vfdsu_ex4_single          <= vfdsu_ex3_single;
++    vfdsu_ex4_half            <= vfdsu_ex3_half;
++    vfdsu_ex4_bfloat          <= vfdsu_ex3_bfloat;
+   end
+   else
+   begin
+@@ -1009,6 +1125,8 @@ begin
+     vfdsu_ex4_potnt_norm[1:0] <= vfdsu_ex4_potnt_norm[1:0];
+     vfdsu_ex4_double          <= vfdsu_ex4_double;
+     vfdsu_ex4_single          <= vfdsu_ex4_single;
++    vfdsu_ex4_half            <= vfdsu_ex4_half;
++    vfdsu_ex4_bfloat          <= vfdsu_ex4_bfloat;
+   end  
+ end    
+ 
+diff --git a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v
+index c7a679c..4d91a2c 100644
+--- a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v
++++ b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_scalar_dp.v
+@@ -30,6 +30,8 @@ module ct_vfdsu_scalar_dp(
+   ex1_double,
+   ex1_pipedown,
+   ex1_scalar,
++  ex1_half,
++  ex1_bfloat,
+   ex1_single,
+   ex1_sqrt,
+   ex1_src0,
+@@ -50,7 +52,9 @@ module ct_vfdsu_scalar_dp(
+   pipex_dp_vfdsu_freg_data,
+   pipex_dp_vfdsu_vreg,
+   vfdsu_ex2_double,
+-  vfdsu_ex2_single
++  vfdsu_ex2_single,
++  vfdsu_ex2_half,
++  vfdsu_ex2_bfloat
+ );
+ 
+ // &Ports; @24
+@@ -79,6 +83,8 @@ output          ex1_div;
+ output          ex1_double;                   
+ output          ex1_scalar;                   
+ output          ex1_single;                   
++output          ex1_half;
++output          ex1_bfloat;
+ output          ex1_sqrt;                     
+ output  [63:0]  ex1_src0;                     
+ output  [63:0]  ex1_src1;                     
+@@ -89,11 +95,15 @@ output  [63:0]  pipex_dp_vfdsu_freg_data;
+ output  [6 :0]  pipex_dp_vfdsu_vreg;          
+ output          vfdsu_ex2_double;             
+ output          vfdsu_ex2_single;             
++output          vfdsu_ex2_half;
++output          vfdsu_ex2_bfloat;
+ 
+ // &Regs; @25
+ reg             ex1_div;                      
+ reg             ex1_double;                   
+ reg             ex1_single;                   
++reg             ex1_half;
++reg             ex1_bfloat;
+ reg             ex1_sqrt;                     
+ reg             vfdsu_ex2_div;                
+ reg             vfdsu_ex2_double;             
+@@ -101,6 +111,8 @@ reg     [4 :0]  vfdsu_ex2_dst_ereg;
+ reg     [6 :0]  vfdsu_ex2_dst_vreg;           
+ reg     [6 :0]  vfdsu_ex2_iid;                
+ reg             vfdsu_ex2_single;             
++reg             vfdsu_ex2_half;
++reg             vfdsu_ex2_bfloat;
+ reg             vfdsu_ex2_sqrt;               
+ reg     [4 :0]  vfdsu_ex3_dst_ereg;           
+ reg     [6 :0]  vfdsu_ex3_dst_vreg;           
+@@ -175,6 +187,8 @@ begin
+     ex1_sqrt           <= 1'b0;
+     ex1_double         <= 1'b0;
+     ex1_single         <= 1'b0;
++    ex1_half           <= 1'b0;
++    ex1_bfloat         <= 1'b0;
+   end
+   else if(idu_vfpu_rf_pipex_gateclk_sel)
+   begin
+@@ -182,6 +196,8 @@ begin
+     ex1_sqrt           <= idu_vfpu_rf_pipex_func[1];
+     ex1_double         <= idu_vfpu_rf_pipex_func[16];
+     ex1_single         <= idu_vfpu_rf_pipex_func[15];
++    ex1_half           <= idu_vfpu_rf_pipex_func[14];
++    ex1_bfloat         <= idu_vfpu_rf_pipex_func[13];
+   end
+ end
+ assign ex1_scalar         = 1'b1;
+@@ -204,6 +220,8 @@ begin
+     vfdsu_ex2_iid[6:0]      <= 7'b0;
+     vfdsu_ex2_double        <= 1'b0;
+     vfdsu_ex2_single        <= 1'b0;
++    vfdsu_ex2_half          <= 1'b0;
++    vfdsu_ex2_bfloat        <= 1'b0;
+     vfdsu_ex2_div           <=  1'b0;
+     vfdsu_ex2_sqrt          <=  1'b0;
+   end
+@@ -214,6 +232,8 @@ begin
+     vfdsu_ex2_iid[6:0]      <= dp_vfdsu_ex1_pipex_iid[6:0];
+     vfdsu_ex2_double        <= ex1_double;
+     vfdsu_ex2_single        <= ex1_single;
++    vfdsu_ex2_half          <= ex1_half;
++    vfdsu_ex2_bfloat        <= ex1_bfloat;
+     vfdsu_ex2_div           <= ex1_div;
+     vfdsu_ex2_sqrt          <= ex1_sqrt;
+   end
+@@ -224,6 +244,8 @@ begin
+     vfdsu_ex2_iid[6:0]      <= vfdsu_ex2_iid[6:0];
+     vfdsu_ex2_double        <= vfdsu_ex2_double;
+     vfdsu_ex2_single        <= vfdsu_ex2_single;
++    vfdsu_ex2_half          <= vfdsu_ex2_half;   // hold, same as the double/single flags in this branch
++    vfdsu_ex2_bfloat        <= vfdsu_ex2_bfloat; // hold, same as the double/single flags in this branch
+     vfdsu_ex2_div           <= vfdsu_ex2_div;
+     vfdsu_ex2_sqrt          <= vfdsu_ex2_sqrt;
+   end
+diff --git a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v
+index cdeb3a3..8e8d66b 100644
+--- a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v
++++ b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_srt.v
+@@ -49,12 +49,15 @@ module ct_vfdsu_srt(
+   vfdsu_ex2_result_zero,
+   vfdsu_ex2_rm,
+   vfdsu_ex2_single,
++  vfdsu_ex2_half,
++  vfdsu_ex2_bfloat,
+   vfdsu_ex2_sqrt,
+   vfdsu_ex2_srt_skip,
+   vfdsu_ex3_doub_expnt_rst,
+   vfdsu_ex3_double,
+   vfdsu_ex3_dz,
+   vfdsu_ex3_half_expnt_rst,
++  vfdsu_ex3_bfloat_expnt_rst,
+   vfdsu_ex3_id_srt_skip,
+   vfdsu_ex3_nv,
+   vfdsu_ex3_of,
+@@ -74,6 +77,8 @@ module ct_vfdsu_srt(
+   vfdsu_ex3_rslt_denorm,
+   vfdsu_ex3_sing_expnt_rst,
+   vfdsu_ex3_single,
++  vfdsu_ex3_half,
++  vfdsu_ex3_bfloat,
+   vfdsu_ex3_uf
+ );
+ 
+@@ -109,6 +114,8 @@ input           vfdsu_ex2_result_sign;
+ input           vfdsu_ex2_result_zero;                 
+ input   [2 :0]  vfdsu_ex2_rm;                          
+ input           vfdsu_ex2_single;                      
++input           vfdsu_ex2_half;
++input           vfdsu_ex2_bfloat;
+ input           vfdsu_ex2_sqrt;                        
+ input           vfdsu_ex2_srt_skip;                    
+ output          srt_ctrl_rem_zero;                     
+@@ -118,6 +125,7 @@ output  [12:0]  vfdsu_ex3_doub_expnt_rst;
+ output          vfdsu_ex3_double;                      
+ output          vfdsu_ex3_dz;                          
+ output  [12:0]  vfdsu_ex3_half_expnt_rst;              
++output  [12:0]  vfdsu_ex3_bfloat_expnt_rst;
+ output          vfdsu_ex3_id_srt_skip;                 
+ output          vfdsu_ex3_nv;                          
+ output          vfdsu_ex3_of;                          
+@@ -137,16 +145,20 @@ output  [2 :0]  vfdsu_ex3_rm;
+ output          vfdsu_ex3_rslt_denorm;                 
+ output  [8 :0]  vfdsu_ex3_sing_expnt_rst;              
+ output          vfdsu_ex3_single;                      
++output          vfdsu_ex3_half;
++output          vfdsu_ex3_bfloat;
+ output          vfdsu_ex3_uf;                          
+ 
+ // &Regs; @24
+ reg     [52:0]  ex2_result_double_denorm_round_add_num; 
+ reg     [52:0]  ex2_result_half_denorm_round_add_num;  
+ reg     [52:0]  ex2_result_single_denorm_round_add_num; 
++reg     [52:0]  ex2_result_bfloat_denorm_round_add_num;
+ reg     [12:0]  vfdsu_ex3_doub_expnt_rst;              
+ reg             vfdsu_ex3_double;                      
+ reg             vfdsu_ex3_dz;                          
+ reg     [12:0]  vfdsu_ex3_half_expnt_rst;              
++reg     [12:0]  vfdsu_ex3_bfloat_expnt_rst;
+ reg             vfdsu_ex3_id_srt_skip;                 
+ reg             vfdsu_ex3_nv;                          
+ reg             vfdsu_ex3_of;                          
+@@ -165,6 +177,8 @@ reg     [2 :0]  vfdsu_ex3_rm;
+ reg             vfdsu_ex3_rslt_denorm;                 
+ reg     [8 :0]  vfdsu_ex3_sing_expnt_rst;              
+ reg             vfdsu_ex3_single;                      
++reg             vfdsu_ex3_half;
++reg             vfdsu_ex3_bfloat;
+ reg             vfdsu_ex3_uf;                          
+ 
+ // &Wires; @25
+@@ -191,6 +205,11 @@ wire            ex2_half_expnt_uf;
+ wire            ex2_half_id_nor_srt_skip;              
+ wire            ex2_half_potnt_of;                     
+ wire            ex2_half_potnt_uf;                     
++wire            ex2_bfloat_expnt_of;
++wire            ex2_bfloat_expnt_uf;
++wire            ex2_bfloat_id_nor_srt_skip;
++wire            ex2_bfloat_potnt_of;
++wire            ex2_bfloat_potnt_uf;
+ wire            ex2_id_nor_srt_skip;                   
+ wire            ex2_of;                                
+ wire            ex2_of_plus;                           
+@@ -253,6 +272,8 @@ wire            vfdsu_ex2_result_sign;
+ wire            vfdsu_ex2_result_zero;                 
+ wire    [2 :0]  vfdsu_ex2_rm;                          
+ wire            vfdsu_ex2_single;                      
++wire            vfdsu_ex2_half;
++wire            vfdsu_ex2_bfloat;
+ wire            vfdsu_ex2_sqrt;                        
+ wire            vfdsu_ex2_srt_skip;                    
+ wire            vfdsu_ex3_rem_zero;                    
+@@ -281,25 +302,33 @@ assign ex2_sing_expnt_of = ~vfdsu_ex2_expnt_rst[9] && (vfdsu_ex2_expnt_rst[8]
+ assign ex2_half_expnt_of = ~vfdsu_ex2_expnt_rst[6] && (vfdsu_ex2_expnt_rst[5] 
+                                                       || (vfdsu_ex2_expnt_rst[4]  &&
+                                                           |vfdsu_ex2_expnt_rst[3:0]));
++assign ex2_bfloat_expnt_of = ~vfdsu_ex2_expnt_rst[9] && (vfdsu_ex2_expnt_rst[8]
++                                                      || (vfdsu_ex2_expnt_rst[7]  &&
++                                                          |vfdsu_ex2_expnt_rst[6:0]));
+ assign ex2_expnt_of      = vfdsu_ex2_double ? ex2_doub_expnt_of :
+-                                              vfdsu_ex2_single  ? ex2_sing_expnt_of
+-                                                                : ex2_half_expnt_of;
++                                              vfdsu_ex2_single  ? ex2_sing_expnt_of :
++                                              vfdsu_ex2_half    ? ex2_half_expnt_of : ex2_bfloat_expnt_of;
+ assign ex2_potnt_of_pre  = vfdsu_ex2_double ? ex2_doub_potnt_of :
+-                           vfdsu_ex2_single ? ex2_sing_potnt_of : ex2_half_potnt_of;   
+-assign ex2_potnt_uf_pre  = vfdsu_ex2_double ? ex2_doub_potnt_uf : 
+-                           vfdsu_ex2_single ? ex2_sing_potnt_uf : ex2_half_potnt_uf;
++                           vfdsu_ex2_single ? ex2_sing_potnt_of :
++                           vfdsu_ex2_half   ? ex2_half_potnt_of : ex2_bfloat_potnt_of;
++assign ex2_potnt_uf_pre  = vfdsu_ex2_double ? ex2_doub_potnt_uf :
++                           vfdsu_ex2_single ? ex2_sing_potnt_uf :
++                           vfdsu_ex2_half   ? ex2_half_potnt_uf : ex2_bfloat_potnt_uf;
+ assign ex2_expnt_uf      = vfdsu_ex2_double ? ex2_doub_expnt_uf :
+-                           vfdsu_ex2_single ? ex2_sing_expnt_uf : ex2_half_expnt_uf;
++                           vfdsu_ex2_single ? ex2_sing_expnt_uf :
++                           vfdsu_ex2_half   ? ex2_half_expnt_uf : ex2_bfloat_expnt_uf;
+ assign ex2_id_nor_srt_skip   = vfdsu_ex2_double ? ex2_double_id_nor_srt_skip :
+-                               vfdsu_ex2_single ? ex2_single_id_nor_srt_skip
+-                                                : ex2_half_id_nor_srt_skip; 
++                               vfdsu_ex2_single ? ex2_single_id_nor_srt_skip :
++                               vfdsu_ex2_half   ? ex2_half_id_nor_srt_skip   : ex2_bfloat_id_nor_srt_skip;
+ assign ex2_result_denorm_round_add_num[52:0] = vfdsu_ex2_double ? 
+                                                ex2_result_double_denorm_round_add_num[52:0] :
+                                                vfdsu_ex2_single ? 
+                                                ex2_result_single_denorm_round_add_num[52:0] :
+-                                               ex2_result_half_denorm_round_add_num[52:0];
+-                                             
+-                                                      
++                                               vfdsu_ex2_half   ?
++                                               ex2_result_half_denorm_round_add_num[52:0] :
++                                               ex2_result_bfloat_denorm_round_add_num[52:0];
++
++
+ //potential overflow when E1-E2 = 128/1024
+ assign ex2_doub_potnt_of = ~vfdsu_ex2_expnt_rst[12] && 
+                            ~vfdsu_ex2_expnt_rst[11] &&
+@@ -313,6 +342,10 @@ assign ex2_half_potnt_of = ~vfdsu_ex2_expnt_rst[6]  &&
+                            ~vfdsu_ex2_expnt_rst[5]  &&
+                             vfdsu_ex2_expnt_rst[4]  &&
+                           ~|vfdsu_ex2_expnt_rst[3:0];  
++assign ex2_bfloat_potnt_of = ~vfdsu_ex2_expnt_rst[9]  &&
++                           ~vfdsu_ex2_expnt_rst[8]  &&
++                            vfdsu_ex2_expnt_rst[7]  &&
++                          ~|vfdsu_ex2_expnt_rst[6:0];
+ assign ex2_potnt_of      = ex2_potnt_of_pre && 
+                            vfdsu_ex2_op0_norm && 
+                            vfdsu_ex2_op1_norm && 
+@@ -321,6 +354,7 @@ assign ex2_potnt_of      = ex2_potnt_of_pre &&
+ //When input is normal, underflow when E1-E2 <= -127/-1023/-15
+ assign ex2_doub_expnt_uf = vfdsu_ex2_expnt_rst[12] && (vfdsu_ex2_expnt_rst[11:0] <= 12'hc01);
+ assign ex2_sing_expnt_uf = vfdsu_ex2_expnt_rst[12] && (vfdsu_ex2_expnt_rst[11:0] <= 12'hf81);
++assign ex2_bfloat_expnt_uf = vfdsu_ex2_expnt_rst[12] && (vfdsu_ex2_expnt_rst[11:0] <= 12'hf81);
+ assign ex2_half_expnt_uf = vfdsu_ex2_expnt_rst[12] && (vfdsu_ex2_expnt_rst[11:0] <= 12'hff1);
+ assign ex2_half_potnt_uf = &vfdsu_ex2_expnt_rst[6:4]   &&
+                           ~|vfdsu_ex2_expnt_rst[3:2]   &&
+@@ -337,6 +371,10 @@ assign ex2_sing_potnt_uf = &vfdsu_ex2_expnt_rst[9:7]   &&
+                           ~|vfdsu_ex2_expnt_rst[6:2]   &&
+                             vfdsu_ex2_expnt_rst[1]     &&
+                            !vfdsu_ex2_expnt_rst[0];
++assign ex2_bfloat_potnt_uf = &vfdsu_ex2_expnt_rst[9:7]   &&
++                          ~|vfdsu_ex2_expnt_rst[6:2]   &&
++                            vfdsu_ex2_expnt_rst[1]     &&
++                           !vfdsu_ex2_expnt_rst[0];
+ 
+ assign ex2_potnt_uf      = (ex2_potnt_uf_pre && 
+                             vfdsu_ex2_op0_norm && 
+@@ -371,6 +409,8 @@ assign ex2_single_id_nor_srt_skip =  vfdsu_ex2_expnt_rst[12]
+                                      && (vfdsu_ex2_expnt_rst[11:0]<12'hf6a);
+ assign ex2_half_id_nor_srt_skip   =  vfdsu_ex2_expnt_rst[12] 
+                                      && (vfdsu_ex2_expnt_rst[11:0]<12'hfe7);
++assign ex2_bfloat_id_nor_srt_skip =  vfdsu_ex2_expnt_rst[12]
++                                     && (vfdsu_ex2_expnt_rst[11:0]<12'hf6a);
+ assign ex2_rslt_denorm            = ex2_uf;
+ 
+ //=======================EX2 skip srt iteration======================
+@@ -490,6 +530,23 @@ endcase
+ // &CombEnd; @248
+ end
+ 
++// &CombBeg; @204
++always @( vfdsu_ex2_expnt_rst[12:0])
++begin
++case(vfdsu_ex2_expnt_rst[12:0])
++  13'h1f82:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h200000000000; //-126 1
++  13'h1f81:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h400000000000; //-127 0
++  13'h1f80:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h800000000000; //-128 -1
++  13'h1f7f:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h1000000000000; //-129 -2
++  13'h1f7e:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h2000000000000; //-130 -3
++  13'h1f7d:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h4000000000000; //-131 -4
++  13'h1f7c:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h8000000000000; //-132 -5
++  13'h1f7b:ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h10000000000000; //-133 -6
++  default: ex2_result_bfloat_denorm_round_add_num[52:0] = 53'h0;  // -23
++endcase
++// &CombEnd; @232
++end
++
+ //===================special result========================
+ assign ex2_result_zero = vfdsu_ex2_result_zero;
+ assign ex2_result_qnan = vfdsu_ex2_result_qnan;
+@@ -541,6 +598,7 @@ begin
+     vfdsu_ex3_doub_expnt_rst[12:0] <= 13'b0;
+     vfdsu_ex3_sing_expnt_rst[8:0] <= 9'b0;
+     vfdsu_ex3_half_expnt_rst[12:0] <= 13'b0;
++    vfdsu_ex3_bfloat_expnt_rst[12:0] <= 13'b0;
+     vfdsu_ex3_result_sign     <= 1'b0;
+     vfdsu_ex3_qnan_sign       <= 1'b0;    
+     vfdsu_ex3_qnan_f[51:0]    <= 52'b0;
+@@ -551,6 +609,8 @@ begin
+     vfdsu_ex3_id_srt_skip     <= 1'b0;
+     vfdsu_ex3_double          <=  1'b0;
+     vfdsu_ex3_single          <=  1'b0;
++    vfdsu_ex3_half            <=  1'b0;
++    vfdsu_ex3_bfloat          <=  1'b0;
+   end
+   else if(ex2_pipedown)
+   begin
+@@ -569,6 +629,7 @@ begin
+     vfdsu_ex3_doub_expnt_rst[12:0] <= vfdsu_ex2_expnt_rst[12:0];
+     vfdsu_ex3_sing_expnt_rst[8:0] <= vfdsu_ex2_expnt_rst[8:0];
+     vfdsu_ex3_half_expnt_rst[12:0] <= vfdsu_ex2_expnt_rst[12:0];
++    vfdsu_ex3_bfloat_expnt_rst[12:0] <= vfdsu_ex2_expnt_rst[12:0];
+     vfdsu_ex3_result_sign     <= vfdsu_ex2_result_sign;
+     vfdsu_ex3_qnan_sign       <= vfdsu_ex2_qnan_sign;    
+     vfdsu_ex3_qnan_f[51:0]    <= vfdsu_ex2_qnan_f[51:0];
+@@ -579,6 +640,8 @@ begin
+     vfdsu_ex3_id_srt_skip     <= ex2_id_nor_srt_skip;
+     vfdsu_ex3_double          <= vfdsu_ex2_double;
+     vfdsu_ex3_single          <= vfdsu_ex2_single;
++    vfdsu_ex3_half            <= vfdsu_ex2_half;
++    vfdsu_ex3_bfloat          <= vfdsu_ex2_bfloat;
+   end
+   else
+   begin
+@@ -597,6 +660,7 @@ begin
+     vfdsu_ex3_doub_expnt_rst[12:0] <= vfdsu_ex3_doub_expnt_rst[12:0];
+     vfdsu_ex3_sing_expnt_rst[8:0] <= vfdsu_ex3_sing_expnt_rst[8:0];
+     vfdsu_ex3_half_expnt_rst[12:0] <= vfdsu_ex3_half_expnt_rst[12:0];
++    vfdsu_ex3_bfloat_expnt_rst[12:0] <= vfdsu_ex3_bfloat_expnt_rst[12:0];
+     vfdsu_ex3_result_sign     <= vfdsu_ex3_result_sign;
+     vfdsu_ex3_qnan_sign       <= vfdsu_ex3_qnan_sign;     
+     vfdsu_ex3_qnan_f[51:0]    <= vfdsu_ex3_qnan_f[51:0];
+@@ -607,6 +671,8 @@ begin
+     vfdsu_ex3_id_srt_skip    <=  vfdsu_ex3_id_srt_skip;
+     vfdsu_ex3_double          <= vfdsu_ex3_double;
+     vfdsu_ex3_single          <= vfdsu_ex3_single;
++    vfdsu_ex3_half            <= vfdsu_ex3_half;
++    vfdsu_ex3_bfloat          <= vfdsu_ex3_bfloat;
+   end
+ end
+ assign vfdsu_ex3_rem_zero       =  ~|srt_remainder[60:0];
+diff --git a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v
+index f884625..28ca259 100644
+--- a/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v
++++ b/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_top.v
+@@ -99,6 +99,8 @@ wire            ex1_double;
+ wire            ex1_pipedown;                 
+ wire            ex1_scalar;                   
+ wire            ex1_single;                   
++wire            ex1_half;
++wire            ex1_bfloat;
+ wire            ex1_sqrt;                     
+ wire    [63:0]  ex1_src0;                     
+ wire    [63:0]  ex1_src1;                     
+@@ -128,6 +130,8 @@ wire            vfdsu_dp_fdiv_busy;
+ wire            vfdsu_dp_inst_wb_req;         
+ wire            vfdsu_ex2_double;             
+ wire            vfdsu_ex2_single;             
++wire            vfdsu_ex2_half;
++wire            vfdsu_ex2_bfloat;
+ wire            vfdsu_ifu_debug_ex2_wait;     
+ wire            vfdsu_ifu_debug_idle;         
+ wire            vfdsu_ifu_debug_pipe_busy;    
+@@ -234,6 +238,8 @@ ct_vfdsu_ctrl  x_ct_vfdsu_ctrl (
+   .ex1_double                  (ex1_double                 ),
+   .ex1_pipedown                (ex1_pipedown               ),
+   .ex1_single                  (ex1_single                 ),
++  .ex1_half                    (ex1_half                   ),
++  .ex1_bfloat                  (ex1_bfloat                 ),
+   .ex2_data_clk                (ex2_data_clk               ),
+   .ex2_pipedown                (ex2_pipedown               ),
+   .ex2_srt_first_round         (ex2_srt_first_round        ),
+@@ -251,6 +257,8 @@ ct_vfdsu_ctrl  x_ct_vfdsu_ctrl (
+   .vfdsu_dp_inst_wb_req        (vfdsu_dp_inst_wb_req       ),
+   .vfdsu_ex2_double            (vfdsu_ex2_double           ),
+   .vfdsu_ex2_single            (vfdsu_ex2_single           ),
++  .vfdsu_ex2_half              (vfdsu_ex2_half             ),
++  .vfdsu_ex2_bfloat            (vfdsu_ex2_bfloat           ),
+   .vfdsu_ifu_debug_ex2_wait    (vfdsu_ifu_debug_ex2_wait   ),
+   .vfdsu_ifu_debug_idle        (vfdsu_ifu_debug_idle       ),
+   .vfdsu_ifu_debug_pipe_busy   (vfdsu_ifu_debug_pipe_busy  )
+@@ -266,6 +274,8 @@ ct_vfdsu_double  x_ct_vfdsu_double (
+   .ex1_pipedown        (ex1_pipedown       ),
+   .ex1_scalar          (ex1_scalar         ),
+   .ex1_single          (ex1_single         ),
++  .ex1_half            (ex1_half           ),
++  .ex1_bfloat          (ex1_bfloat         ),
+   .ex1_sqrt            (ex1_sqrt           ),
+   .ex1_src0            (ex1_src0           ),
+   .ex1_src1            (ex1_src1           ),
+@@ -302,6 +312,8 @@ ct_vfdsu_scalar_dp  x_ct_vfdsu_scalar_dp (
+   .ex1_pipedown                  (ex1_pipedown                 ),
+   .ex1_scalar                    (ex1_scalar                   ),
+   .ex1_single                    (ex1_single                   ),
++  .ex1_half                      (ex1_half                     ),
++  .ex1_bfloat                    (ex1_bfloat                   ),
+   .ex1_sqrt                      (ex1_sqrt                     ),
+   .ex1_src0                      (ex1_src0                     ),
+   .ex1_src1                      (ex1_src1                     ),
+@@ -321,7 +333,9 @@ ct_vfdsu_scalar_dp  x_ct_vfdsu_scalar_dp (
+   .pipex_dp_vfdsu_freg_data      (pipex_dp_vfdsu_freg_data     ),
+   .pipex_dp_vfdsu_vreg           (pipex_dp_vfdsu_vreg          ),
+   .vfdsu_ex2_double              (vfdsu_ex2_double             ),
+-  .vfdsu_ex2_single              (vfdsu_ex2_single             )
++  .vfdsu_ex2_single              (vfdsu_ex2_single             ),
++  .vfdsu_ex2_half                (vfdsu_ex2_half               ),
++  .vfdsu_ex2_bfloat              (vfdsu_ex2_bfloat             )
+ );
+ 
+ 
+-- 
+2.16.5
+

From 0a9568e7a37f105283ff906a371de3b5dc479b1f Mon Sep 17 00:00:00 2001
From: Luca Bertaccini <55843305+lucabertaccini@users.noreply.github.com>
Date: Fri, 27 Sep 2024 11:52:27 +0200
Subject: [PATCH 5/8] Fix illegal Verilog assignment (#15)

---
 docs/CHANGELOG-PULP.md                                       | 5 +++++
 .../C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v      | 2 +-
 .../0001-Add-FP16ALT-support-to-THMULTI-DivSqrt-unit.patch   | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/docs/CHANGELOG-PULP.md b/docs/CHANGELOG-PULP.md
index 94d245be..17ef11f5 100644
--- a/docs/CHANGELOG-PULP.md
+++ b/docs/CHANGELOG-PULP.md
@@ -7,6 +7,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a
 In this sense, we interpret the "Public API" of a hardware module as its port/parameter list.
 Versions of the IP in the same major relase are "pin-compatible" with each other. Minor relases are permitted to add new parameters as long as their default bindings ensure backwards compatibility.
 
+## [pulp-v0.2.3] - 2024-09-27
+
+### Fix
+- Fix illegal Verilog `'0`
+
 ## [pulp-v0.2.2] - 2024-06-24
 
 ### Added
diff --git a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v
index cb3dc8e3..69462eb9 100644
--- a/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v
+++ b/vendor/openc910/C910_RTL_FACTORY/gen_rtl/vfdsu/rtl/ct_vfdsu_round.v
@@ -763,7 +763,7 @@ case(vfdsu_ex3_bfloat_expnt_rst[8:0])
   9'h17a:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[56:46]}; //-93 -6
                 bfloat_denorm_lst_frac =  1'b0;
           end//-1022 1
-  default:  begin qt_result_bfloat_denorm_for_round[10:0] = '0;
+  default:  begin qt_result_bfloat_denorm_for_round[10:0] = 11'b0;
                  bfloat_denorm_lst_frac = 1'b0;
             end//-1022 1
 endcase
diff --git a/vendor/patches/openc910/0001-Add-FP16ALT-support-to-THMULTI-DivSqrt-unit.patch b/vendor/patches/openc910/0001-Add-FP16ALT-support-to-THMULTI-DivSqrt-unit.patch
index 7d1ce903..fab95f9d 100644
--- a/vendor/patches/openc910/0001-Add-FP16ALT-support-to-THMULTI-DivSqrt-unit.patch
+++ b/vendor/patches/openc910/0001-Add-FP16ALT-support-to-THMULTI-DivSqrt-unit.patch
@@ -817,7 +817,7 @@ index 6eece52..a419289 100644
 +  9'h17a:begin qt_result_bfloat_denorm_for_round[10:0] = {total_qt_rt_58[56:46]}; //-93 -6
 +                bfloat_denorm_lst_frac =  1'b0;
 +          end//-1022 1
-+  default:  begin qt_result_bfloat_denorm_for_round[10:0] = '0;
++  default:  begin qt_result_bfloat_denorm_for_round[10:0] = 11'b0;
 +                 bfloat_denorm_lst_frac = 1'b0;
 +            end//-1022 1
 +endcase

From 5098afdffd8a48319eedca6fb2fdb7d53fab6172 Mon Sep 17 00:00:00 2001
From: Luca Bertaccini <55843305+lucabertaccini@users.noreply.github.com>
Date: Tue, 5 Aug 2025 18:09:03 +0200
Subject: [PATCH 6/8] Update maintainer in README.md

---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b13f00d1..e261e7b4 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,9 @@
 
 Parametric floating-point unit with support for standard RISC-V formats and operations as well as transprecision formats, written in SystemVerilog.
 
-Maintainer: Luca Bertaccini <lbertaccini@iis.ee.ethz.ch><br>
-Principal Author: Stefan Mach <smach@iis.ee.ethz.ch>
+Current Maintainer: Gamze İslamoğlu <gislamoglu@iis.ee.ethz.ch><br>
+Past Maintainer: Luca Bertaccini <lbertaccini@iis.ee.ethz.ch><br>
+Main Author: Stefan Mach <smach@iis.ee.ethz.ch>
 
 ## Features
 

From 29d8a981295a1fb7b6b6c1544843b618d09717ad Mon Sep 17 00:00:00 2001
From: Luca Bertaccini <55843305+lucabertaccini@users.noreply.github.com>
Date: Tue, 5 Aug 2025 18:09:43 +0200
Subject: [PATCH 7/8] Update CODEOWNERS

---
 docs/CODEOWNERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/CODEOWNERS b/docs/CODEOWNERS
index 6b8f7762..21c23e3d 100644
--- a/docs/CODEOWNERS
+++ b/docs/CODEOWNERS
@@ -1,2 +1,2 @@
 # Global owners
-*	@lucabertaccini
+*	@gamzeisl

From 8edb8754ef06ae80c59c452b883fdae457e9827e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gamze=20=C4=B0slamo=C4=9Flu?=
 <54476562+gamzeisl@users.noreply.github.com>
Date: Wed, 25 Feb 2026 13:12:48 +0100
Subject: [PATCH 8/8] Add MXDOTP operation group with FP4/FP6/FP8 source format
 support (#17)

* Add FP4, FP6, FP6ALT formats and MXDOTP operation support to fpnew_pkg

Extended fpnew_pkg.sv with new floating-point formats and MXDOTP operation
group for MX dot product operations:

- New formats: FP6(E3M2), FP6ALT(E2M3), FP4(E2M1)
- Increased NUM_FP_FORMATS from 6 to 9
- Added MXDOTP operation group (6th group)
- New operations: MXDOTPF (FP), MXDOTPI (INT)
- Updated all format masks from 6-bit to 9-bit
- Added bias_constant() helper function for MXDOTP
- Updated FPU configurations (DEFAULT_NOREGS, DEFAULT_SNITCH)

* Add MXDOTP multi-format package definitions

Introduces fpnew_mxdotp_multi_pkg.sv with parameterized configuration for
MXDOTP operations supporting mixed-precision
arithmetic with low precision formats.

Configuration:
- Source formats: FP4, FP6, FP6ALT, FP8, FP8ALT, INT8
- Destination formats: FP32, FP16ALT

* Add MXDOTP multi-format core implementation

Add core MXDOTP implementation supporting
very low-precision floating-point formats (FP4, FP6, FP8) and INT8.

New files:
- fpnew_mxdotp_multi_modules.sv: 14 modules implementing
  the MXDOTP datapath (classification, multiplication, shifting,
  accumulation, normalization, rounding)
- fpnew_mxdotp_multi.sv: Top-level MXDOTP unit integrating all modules

* Add MXDOTP wrapper

New file:
- fpnew_mxdotp_multi_wrapper.sv: Wrapper handling operand unpacking,
  FP6 extended operand processing (3-step with unroll factor), NaN-boxing,
  and scale extraction

Changes to core module:
- Add NumPipeRegs and PipeConfig as module parameters
- Compute NUM_INP_REGS, NUM_MID_REGS, NUM_OUT_REGS from parameters

* Extend classifier for MX floating-point formats

Add MX parameter and format-specific classification logic to support low-precision formats used in MXDOTP operations.

Changes:
- Add MX parameter (default 1) to enable MX-specific classification
- FP8ALT (E4M3): No infinity, NaN when exp=all1s and man=all1s
- FP6/FP6ALT/FP4 (E3M2/E2M3/E2M1): No infinity or NaN
- Other formats: Standard IEEE-754 classification

* Add configurable format parameters to MXDOTP wrapper and pkg

* Integrate MXDOTP into opgroup multifmt slice

- Add elaboration-time checks: fatal for Width!=64, missing FP32,
  missing FP8/INT8; warnings for inactive FP6/FP6ALT/FP4
- Add NUM_MX_LANES localparam and lane generation for MXDOTP
- Instantiate fpnew_mxdotp_multi_wrapper with FpFmtConfig and IntFmtConfig

* Update SDOTP wrapper format masks for extended format support

- Widen FpSrcFmtConfig bitmasks from 6b to 9b to match the extended
  NUM_FP_FORMATS (FP6, FP6ALT, FP4 added but masked off for SDOTP)

* Add MXDOTP sources to Bender and src_files

* Update documentation for MXDOTP

* Parameterize MXDOTP format configuration and rename package constants

* Make INT8 optional and unify FP8/INT8 product width

Relax format validation in fpnew_opgroup_multifmt_slice to require only
FP8 and FP8ALT as mandatory base formats, allowing INT8 to be disabled.

* Use bias constant function instead of fixed constant

* Fix default mxdotp operation

* Fix classifier consistency for fp4

* Remove the warning message for MXDOTP about enabled formats
---
 Bender.yml                               |   4 +
 docs/CHANGELOG-PULP.md                   |  18 +
 docs/README.md                           |  37 +-
 src/fpnew_classifier.sv                  |  28 +-
 src/fpnew_mxdotp_multi.sv                | 804 ++++++++++++++++++
 src/fpnew_mxdotp_multi_wrapper.sv        | 245 ++++++
 src/fpnew_opgroup_block.sv               |  22 +-
 src/fpnew_opgroup_multifmt_slice.sv      |  84 +-
 src/fpnew_pkg.sv                         | 131 ++-
 src/fpnew_sdotp_multi_wrapper.sv         |   2 +-
 src/fpnew_top.sv                         |   2 +
 src/mxdotp/fpnew_mxdotp_multi_modules.sv | 987 +++++++++++++++++++++++
 src/mxdotp/fpnew_mxdotp_multi_pkg.sv     | 148 ++++
 src_files.yml                            |   4 +
 14 files changed, 2447 insertions(+), 69 deletions(-)
 create mode 100644 src/fpnew_mxdotp_multi.sv
 create mode 100644 src/fpnew_mxdotp_multi_wrapper.sv
 create mode 100644 src/mxdotp/fpnew_mxdotp_multi_modules.sv
 create mode 100644 src/mxdotp/fpnew_mxdotp_multi_pkg.sv

diff --git a/Bender.yml b/Bender.yml
index b635aa07..6b47a7f1 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -45,6 +45,10 @@ sources:
   - src/fpnew_sdotp_multi.sv
   - src/fpnew_sdotp_multi_wrapper.sv
   - src/fpnew_noncomp.sv
+  - src/mxdotp/fpnew_mxdotp_multi_pkg.sv
+  - src/mxdotp/fpnew_mxdotp_multi_modules.sv
+  - src/fpnew_mxdotp_multi.sv
+  - src/fpnew_mxdotp_multi_wrapper.sv
   - src/fpnew_opgroup_block.sv
   - src/fpnew_opgroup_fmt_slice.sv
   - src/fpnew_opgroup_multifmt_slice.sv
diff --git a/docs/CHANGELOG-PULP.md b/docs/CHANGELOG-PULP.md
index 17ef11f5..2c7c544a 100644
--- a/docs/CHANGELOG-PULP.md
+++ b/docs/CHANGELOG-PULP.md
@@ -7,6 +7,24 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a
 In this sense, we interpret the "Public API" of a hardware module as its port/parameter list.
 Versions of the IP in the same major relase are "pin-compatible" with each other. Minor relases are permitted to add new parameters as long as their default bindings ensure backwards compatibility.
 
+## [Unreleased]
+
+### Added
+- Add FP6(E3M2), FP6ALT(E2M3), and FP4(E2M1) floating-point formats
+- Add MXDOTP Microscaling dot product multi-format operation group
+  - Supports source formats: FP8, FP8ALT, FP6, FP6ALT, FP4, INT8
+  - Supports destination formats: FP32, FP16ALT
+  - Scaled dot-product and accumulation support with two 8-bit exponent scale factors
+
+### Changed
+- Extend classifier to support MX-specific special cases for FP6, FP6ALT, FP4 formats
+- Increase number of supported FP formats from 6 to 9
+- Increase number of opgroups from 5 to 6
+
+### Notes
+- MXDOTP implementation tested with all element formats enabled, but not yet exhaustively tested with all possible combinations of enabled formats.
+- Known limitations documented in TODO comments (see source files for details)
+
 ## [pulp-v0.2.3] - 2024-09-27
 
 ### Fix
diff --git a/docs/README.md b/docs/README.md
index f00fb3b5..9def7b1f 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -109,8 +109,10 @@ Unless noted otherwise, the first operand `op[0]` is used for the operation.
 | `ADD`      | `0`      | Addition (`op[1] + op[2]`) *note the operand indices*                                                                                                                                                            |
 | `ADD`      | `1`      | Subtraction (`op[1] - op[2]`) *note the operand indices*                                                                                                                                                         |
 | `MUL`      | `0`      | Multiplication (`op[0] * op[1]`)                                                                                                                                                                                 |
-| `SDOTP`    | `0`      | Sum of dot product )                                                                                                                                                                                 |
-| `VSUM`     | `0`      | Vector Inner Sum )                                                                                                                                                                                 |
+| `SDOTP`    | `0`      | Sum of dot product                                                                                                                                                                                               |
+| `VSUM`     | `0`      | Vector Inner Sum                                                                                                                                                                                                 |
+| `MXDOTPF`  | `0`      | Microscaling FP scaled dot product and accumulate                                                                                                                                                 |
+| `MXDOTPI`  | `0`      | Microscaling INT scaled dot product and accumulate |
 | `DIV`      | `0`      | Division (`op[0] / op[1]`)                                                                                                                                                                                       |
 | `SQRT`     | `0`      | Square root                                                                                                                                                                                                      |
 | `SGNJ`     | `0`      | Sign injection, operation encoded in rounding mode<br>`RNE`: `op[0]` with `sign(op[1])`<br>`RTZ`: `op[0]` with `~sign(op[1])`<br>`RDN`: `op[0]` with `sign(op[0]) ^ sign(op[1])`<br>`RUP`: `op[0]` (passthrough) |
@@ -130,7 +132,7 @@ Unless noted otherwise, the first operand `op[0]` is used for the operation.
 
 ##### `fp_format_e` - FP Formats
 
-Enumeration of type `logic [2:0]` holding the supported FP formats.
+Enumeration of type `logic [3:0]` holding the supported FP formats.
 
 | Enumerator | Format        | Width  | Exp. Bits | Man. Bits |
 | ---------- | ------------- | -----: | :-------: | :-------: |
@@ -140,10 +142,13 @@ Enumeration of type `logic [2:0]` holding the supported FP formats.
 | `FP8`      | binary8       | 8 bit  | 5         | 2         |
 | `FP16ALT`  | binary16alt   | 16 bit | 8         | 7         |
 | `FP8ALT`   | binary8alt    | 8 bit  | 4         | 3         |
+| `FP6`      | binary6       | 6 bit  | 3         | 2         |
+| `FP6ALT`   | binary6alt    | 6 bit  | 2         | 3         |
+| `FP4`      | binary4       | 4 bit  | 2         | 1         |
 
 The following global parameters associated with FP formats are set in `fpnew_pkg`:
 ```SystemVerilog
-localparam int unsigned NUM_FP_FORMATS = 6;
+localparam int unsigned NUM_FP_FORMATS = 9;
 localparam int unsigned FP_FORMAT_BITS = $clog2(NUM_FP_FORMATS);
 ```
 
@@ -286,7 +291,7 @@ Otherwise, synthesis tools can optimize away any logic associated with this form
 
 #### `Implementation` - Implementation Options
 
-The FPU is divided into five operation groups,  `ADDMUL`, `DIVSQRT`, `NONDOMP`, `CONV`, and `DOTP` (see [Architecture: Top-Level](#top-level)).
+The FPU is divided into six operation groups: `ADDMUL`, `DIVSQRT`, `NONCOMP`, `CONV`, `DOTP`, and `MXDOTP` (see [Architecture: Top-Level](#top-level)).
 The `Implementation` parameter controls the implementation of these operation groups.
 It is of type `fpu_implementation_t` which is defined as:
 ```SystemVerilog
@@ -328,18 +333,19 @@ The unit type `unit_type_t` is an enumeration of type `logic [1:0]` holding the
 The `UnitTypes` parameter allows to control resources used for the FPU by either removing operation units for certain formats and operations, or merging multiple formats into one.
 Currently, the follwoing unit types are available for the FPU operation groups:
 
-|            |      `ADDMUL`      |     `DIVSQRT`      |     `NONCOMP`      |       `CONV`       |       `DOTP`       |
-|------------|--------------------|--------------------|--------------------|--------------------|--------------------|
-| `PARALLEL` | :heavy_check_mark: |                    | :heavy_check_mark: |                    |                    |
-| `MERGED`   | :heavy_check_mark: | :heavy_check_mark: |                    | :heavy_check_mark: | :heavy_check_mark: |
+|            |      `ADDMUL`      |     `DIVSQRT`      |     `NONCOMP`      |       `CONV`       |       `DOTP`       |      `MXDOTP`      |
+|------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
+| `PARALLEL` | :heavy_check_mark: |                    | :heavy_check_mark: |                    |                    |                    |
+| `MERGED`   | :heavy_check_mark: | :heavy_check_mark: |                    | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
 
 *Default*:
 ```SystemVerilog
-'{'{default: PARALLEL}, // ADDMUL
-  '{default: MERGED},   // DIVSQRT
-  '{default: PARALLEL}, // NONCOMP
-  '{default: MERGED},   // CONV`
-  '{default: DISABLED}} // DOTP`
+'{'{default: PARALLEL},  // ADDMUL
+  '{default: MERGED},    // DIVSQRT
+  '{default: PARALLEL},  // NONCOMP
+  '{default: MERGED},    // CONV
+  '{default: DISABLED},  // DOTP
+  '{default: DISABLED}}  // MXDOTP
 ```
 (all formats within operation group use same type)
 
@@ -437,7 +443,7 @@ The *operation group* is the highest level of grouping within FPnew and signifie
 
 ![FPnew](fig/top_block.png)
 
-There are currently five operation groups in FPnew which are enumerated in `opgroup_e` as outlined in the following table:
+There are currently six operation groups in FPnew which are enumerated in `opgroup_e` as outlined in the following table:
 
 | Enumerator |                  Description                  |         Associated Operations         |
 |------------|-----------------------------------------------|---------------------------------------|
@@ -446,6 +452,7 @@ There are currently five operation groups in FPnew which are enumerated in `opgr
 | `NONCOMP`  | Non-Computational Operations like Comparisons | `SGNJ`, `MINMAX`, `CMP`, `CLASS`      |
 | `CONV`     | Conversions                                   | `F2I`, `I2F`, `F2F`, `CPKAB`, `CPKCD` |
 | `DOTP`     | Dot Products                                  | `SDOTP`, `EXVSUM`, `VSUM`             |
+| `MXDOTP`   | Microscaling Dot Products                     | `MXDOTPF`, `MXDOTPI`                  |
 
 Most architectural decisions for FPnew are made at very fine granularity.
 The big exception to this is the generation of vectorial hardware which is decided at top level through the `EnableVectors` parameter.
diff --git a/src/fpnew_classifier.sv b/src/fpnew_classifier.sv
index a322946d..632416db 100644
--- a/src/fpnew_classifier.sv
+++ b/src/fpnew_classifier.sv
@@ -16,6 +16,7 @@
 module fpnew_classifier #(
   parameter fpnew_pkg::fp_format_e   FpFormat = fpnew_pkg::fp_format_e'(0),
   parameter int unsigned             NumOperands = 1,
+  parameter int unsigned             MX = 0,
   // Do not change
   localparam int unsigned WIDTH = fpnew_pkg::fp_width(FpFormat)
 ) (
@@ -51,13 +52,30 @@ module fpnew_classifier #(
     // Classify Input
     // ---------------
     always_comb begin : classify_input
-      value         = operands_i[op];
-      is_boxed      = is_boxed_i[op];
-      is_normal     = is_boxed && (value.exponent != '0) && (value.exponent != '1);
+      value    = operands_i[op];
+      is_boxed = is_boxed_i[op];
+
+      if (MX == 1 && FpFormat == fpnew_pkg::fp_format_e'(fpnew_pkg::FP8ALT)) begin
+        // E4M3: No infinity, NaN when exp=all1s and man=all1s
+        is_inf    = 1'b0;
+        is_nan    = !is_boxed || ((value.exponent == '1) && (value.mantissa == '1));
+        is_normal = is_boxed && (value.exponent != '0) && !is_nan;
+      end else if (MX == 1 && (FpFormat == fpnew_pkg::fp_format_e'(fpnew_pkg::FP6) ||
+                                FpFormat == fpnew_pkg::fp_format_e'(fpnew_pkg::FP6ALT) ||
+                                FpFormat == fpnew_pkg::fp_format_e'(fpnew_pkg::FP4))) begin
+        // E3M2, E2M3, E2M1: No infinity or NaN
+        is_inf    = 1'b0;
+        is_nan    = 1'b0;
+        is_normal = is_boxed && (value.exponent != '0);
+      end else begin
+        // Standard IEEE-754 classification (for all other formats and MX=0)
+        is_inf    = is_boxed && ((value.exponent == '1) && (value.mantissa == '0));
+        is_nan    = !is_boxed || ((value.exponent == '1) && (value.mantissa != '0));
+        is_normal = is_boxed && (value.exponent != '0) && (value.exponent != '1);
+      end
+
       is_zero       = is_boxed && (value.exponent == '0) && (value.mantissa == '0);
       is_subnormal  = is_boxed && (value.exponent == '0) && !is_zero;
-      is_inf        = is_boxed && ((value.exponent == '1) && (value.mantissa == '0));
-      is_nan        = !is_boxed || ((value.exponent == '1) && (value.mantissa != '0));
       is_signalling = is_boxed && is_nan && (value.mantissa[MAN_BITS-1] == 1'b0);
       is_quiet      = is_nan && !is_signalling;
       // Assign output for current input
diff --git a/src/fpnew_mxdotp_multi.sv b/src/fpnew_mxdotp_multi.sv
new file mode 100644
index 00000000..6038bf99
--- /dev/null
+++ b/src/fpnew_mxdotp_multi.sv
@@ -0,0 +1,804 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+//
+// Copyright and related rights are licensed under the Solderpad Hardware
+// License, Version 0.51 (the "License"); you may not use this file except in
+// compliance with the License. You may obtain a copy of the License at
+// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+// or agreed to in writing, software, hardware and materials distributed under
+// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
+
+// Author: Gamze Islamoglu <gislamoglu@iis.ee.ethz.ch>
+
+`include "common_cells/registers.svh"
+
+module fpnew_mxdotp_multi
+  import fpnew_mxdotp_multi_pkg::*;
+#(
+  // By default, all MX formats are enabled for the source and FP32 and FP16ALT are enabled for the destination.
+  parameter fpnew_pkg::fmt_logic_t   FpSrcFmtConfig  = MxdotpSrcFpFmtConfig,
+  parameter fpnew_pkg::ifmt_logic_t  IntSrcFmtConfig = MxdotpSrcIntFmtConfig,
+  parameter fpnew_pkg::fmt_logic_t   FpDstFmtConfig  = MxdotpDstFpFmtConfig,
+  parameter int unsigned             NumPipeRegs = 4,
+  parameter fpnew_pkg::pipe_config_t PipeConfig  = fpnew_pkg::BEFORE,
+  parameter type                     TagType     = logic,
+  parameter type                     AuxType     = logic
+) (
+  input  logic                        clk_i,
+  input  logic                        rst_ni,
+  // Input signals
+  input  logic [VectorSize-1:0][SRC_WIDTH-1:0] operands_a_i,
+  input  logic [VectorSize-1:0][SRC_WIDTH-1:0] operands_b_i,
+  input  logic [1:0] operands_a_fp6_rem_i,
+  input  logic [1:0] operands_b_fp6_rem_i,
+  input  logic [1:0][SCALE_WIDTH-1:0] operands_c_i, // 2 operands
+  input  logic [DST_WIDTH-1:0]        operand_d_i, // 1 operand, accumulator
+  input  logic [NUM_FORMATS-1:0][NUM_OPERANDS-1:0] is_boxed_i,
+  input  fpnew_pkg::roundmode_e       rnd_mode_i,
+  input  fpnew_pkg::operation_e       op_i,
+  input  logic                        op_mod_i,
+  input  fpnew_pkg::fp_format_e       src_fmt_i, // format of the multiplicands
+  input  fpnew_pkg::int_format_e      int_fmt_i, // format of the multiplicands if they are integers
+  input  fpnew_pkg::fp_format_e       dst_fmt_i, // format of the addend and result
+  input  TagType                      tag_i,
+  input  logic                        mask_i,
+  input  AuxType                      aux_i,
+  // Input Handshake
+  input  logic                        in_valid_i,
+  output logic                        in_ready_o,
+  input  logic                        flush_i,
+  // Output signals
+  output logic [DST_WIDTH-1:0]        result_o,
+  output fpnew_pkg::status_t          status_o,
+  output logic                        extension_bit_o,
+  output TagType                      tag_o,
+  output logic                        mask_o,
+  output AuxType                      aux_o,
+  // Output handshake
+  output logic                        out_valid_o,
+  input  logic                        out_ready_i,
+  // Indication of valid data in flight
+  output logic                        busy_o
+);
+
+  // ----------------
+  // Pipeline stages
+  // ----------------
+  localparam int unsigned NUM_INP_REGS = PipeConfig == fpnew_pkg::BEFORE
+                                         ? NumPipeRegs
+                                         : (PipeConfig == fpnew_pkg::DISTRIBUTED
+                                            ? ((NumPipeRegs + 1) / 3)
+                                            : 0);
+  localparam int unsigned NUM_MID_REGS = PipeConfig == fpnew_pkg::INSIDE
+                                         ? NumPipeRegs
+                                         : (PipeConfig == fpnew_pkg::DISTRIBUTED
+                                            ? ((NumPipeRegs + 2) / 3)
+                                            : 0);
+  localparam int unsigned NUM_OUT_REGS = PipeConfig == fpnew_pkg::AFTER
+                                         ? NumPipeRegs
+                                         : (PipeConfig == fpnew_pkg::DISTRIBUTED
+                                            ? (NumPipeRegs / 3)
+                                            : 0);
+
+  // -----------------------------------------
+  // Config-dependent derived localparams
+  // -----------------------------------------
+  // Computed from module parameters instead of package constants
+  localparam int unsigned FP6_VECTOR_SIZE = ((FpSrcFmtConfig[fpnew_pkg::FP6] || FpSrcFmtConfig[fpnew_pkg::FP6ALT]) == 1) ?
+                                            (((FpSrcFmtConfig[fpnew_pkg::FP8] || FpSrcFmtConfig[fpnew_pkg::FP8ALT]) == 1) ? 3 : 11) : 0;
+  localparam int unsigned FP4_VECTOR_SIZE = (FpSrcFmtConfig[fpnew_pkg::FP4] == 1) ?
+                                            (((FpSrcFmtConfig[fpnew_pkg::FP8] || FpSrcFmtConfig[fpnew_pkg::FP8ALT]) == 1) ?
+                                            (((FpSrcFmtConfig[fpnew_pkg::FP6] || FpSrcFmtConfig[fpnew_pkg::FP6ALT]) == 1) ? 5 : 8) : 16) : 0;
+
+  localparam int unsigned INT_SUPER_BITS = fpnew_pkg::max_int_width(IntSrcFmtConfig);
+
+  // FP8/INT8 Lane configuration
+  localparam int unsigned PROD_BITS = fpnew_pkg::maximum(2*INT_SUPER_BITS, 2*PRECISION_BITS+1); // +1 for the sign bit in FP8 product
+
+  localparam int unsigned FP6_SUM_WIDTH = $clog2(FP6_VECTOR_SIZE) + FP6_PROD_SHIFT_WIDTH;
+  localparam int unsigned FP4_SUM_WIDTH = $clog2(FP4_VECTOR_SIZE) + FP4_PROD_SHIFT_WIDTH;
+
+  // ---------------
+  // Input pipeline
+  // ---------------
+  // Selected pipeline output signals as non-arrays
+  logic [VectorSize-1:0][SRC_WIDTH-1:0] operands_a_q;
+  logic [VectorSize-1:0][SRC_WIDTH-1:0] operands_b_q;
+  logic [1:0] operands_a_fp6_rem_q;
+  logic [1:0] operands_b_fp6_rem_q;
+  logic [1:0][SCALE_WIDTH-1:0] operands_c_q;
+  logic [DST_WIDTH-1:0] operand_d_q;
+  fpnew_pkg::fp_format_e src_fmt_q;
+  fpnew_pkg::int_format_e int_fmt_q;
+  fpnew_pkg::fp_format_e dst_fmt_q;
+
+  // Input pipeline signals, index i holds signal after i register stages
+  logic                   [0:NUM_INP_REGS][VectorSize-1:0][SRC_WIDTH-1:0]   inp_pipe_operands_a_q;
+  logic                   [0:NUM_INP_REGS][VectorSize-1:0][SRC_WIDTH-1:0]   inp_pipe_operands_b_q;
+  logic                   [0:NUM_INP_REGS][1:0]                             inp_pipe_operands_a_fp6_rem_q;
+  logic                   [0:NUM_INP_REGS][1:0]                             inp_pipe_operands_b_fp6_rem_q;
+  logic                   [0:NUM_INP_REGS][1:0][SCALE_WIDTH-1:0] inp_pipe_operands_c_q;
+  logic                   [0:NUM_INP_REGS][DST_WIDTH-1:0]        inp_pipe_operand_d_q;
+  logic                   [0:NUM_INP_REGS][NUM_FORMATS-1:0][NUM_OPERANDS-1:0] inp_pipe_is_boxed_q;
+  fpnew_pkg::roundmode_e  [0:NUM_INP_REGS]                       inp_pipe_rnd_mode_q;
+  fpnew_pkg::operation_e  [0:NUM_INP_REGS]                       inp_pipe_op_q;
+  logic                   [0:NUM_INP_REGS]                       inp_pipe_op_mod_q;
+  fpnew_pkg::fp_format_e  [0:NUM_INP_REGS]                       inp_pipe_src_fmt_q;
+  fpnew_pkg::int_format_e [0:NUM_INP_REGS]                       inp_pipe_int_fmt_q;
+  fpnew_pkg::fp_format_e  [0:NUM_INP_REGS]                       inp_pipe_dst_fmt_q;
+  TagType                 [0:NUM_INP_REGS]                       inp_pipe_tag_q;
+  logic                   [0:NUM_INP_REGS]                       inp_pipe_mask_q;
+  AuxType                 [0:NUM_INP_REGS]                       inp_pipe_aux_q;
+  logic                   [0:NUM_INP_REGS]                       inp_pipe_valid_q;
+  // Ready signal is combinatorial for all stages
+  logic [0:NUM_INP_REGS] inp_pipe_ready;
+
+  // Input stage: First element of pipeline is taken from inputs
+  assign inp_pipe_operands_a_q[0]   = operands_a_i;
+  assign inp_pipe_operands_b_q[0]   = operands_b_i;
+  assign inp_pipe_operands_a_fp6_rem_q[0] = operands_a_fp6_rem_i;
+  assign inp_pipe_operands_b_fp6_rem_q[0] = operands_b_fp6_rem_i;
+  assign inp_pipe_operands_c_q[0]   = operands_c_i;
+  assign inp_pipe_operand_d_q[0]    = operand_d_i;
+  assign inp_pipe_is_boxed_q[0]     = is_boxed_i;
+  assign inp_pipe_rnd_mode_q[0]     = rnd_mode_i;
+  assign inp_pipe_op_q[0]           = op_i;
+  assign inp_pipe_op_mod_q[0]       = op_mod_i;
+  assign inp_pipe_src_fmt_q[0]      = src_fmt_i;
+  assign inp_pipe_int_fmt_q[0]      = int_fmt_i;
+  assign inp_pipe_dst_fmt_q[0]      = dst_fmt_i;
+  assign inp_pipe_tag_q[0]          = tag_i;
+  assign inp_pipe_mask_q[0]         = mask_i;
+  assign inp_pipe_aux_q[0]          = aux_i;
+  assign inp_pipe_valid_q[0]        = in_valid_i;
+  // Input stage: Propagate pipeline ready signal to upstream circuitry
+  assign in_ready_o = inp_pipe_ready[0];
+  // Generate the register stages
+  for (genvar i = 0; i < NUM_INP_REGS; i++) begin : gen_input_pipeline
+    // Internal register enable for this stage
+    logic reg_ena;
+    // Determine the ready signal of the current stage - advance the pipeline:
+    // 1. if the next stage is ready for our data
+    // 2. if the next stage only holds a bubble (not valid) -> we can pop it
+    assign inp_pipe_ready[i] = inp_pipe_ready[i+1] | ~inp_pipe_valid_q[i+1];
+    // Valid: enabled by ready signal, synchronous clear with the flush signal
+    `FFLARNC(inp_pipe_valid_q[i+1], inp_pipe_valid_q[i], inp_pipe_ready[i], flush_i, 1'b0, clk_i, rst_ni)
+    // Enable register if pipeline ready and a valid data item is present
+    assign reg_ena = inp_pipe_ready[i] & inp_pipe_valid_q[i];
+    // Generate the pipeline registers within the stages, use enable-registers
+    `FFL(inp_pipe_operands_a_q[i+1],   inp_pipe_operands_a_q[i],   reg_ena, '0)
+    `FFL(inp_pipe_operands_b_q[i+1],   inp_pipe_operands_b_q[i],   reg_ena, '0)
+    `FFL(inp_pipe_operands_a_fp6_rem_q[i+1], inp_pipe_operands_a_fp6_rem_q[i], reg_ena, '0)
+    `FFL(inp_pipe_operands_b_fp6_rem_q[i+1], inp_pipe_operands_b_fp6_rem_q[i], reg_ena, '0)
+    `FFL(inp_pipe_operands_c_q[i+1],   inp_pipe_operands_c_q[i],   reg_ena, '0)
+    `FFL(inp_pipe_operand_d_q[i+1],    inp_pipe_operand_d_q[i],    reg_ena, '0)
+    `FFL(inp_pipe_is_boxed_q[i+1],     inp_pipe_is_boxed_q[i],     reg_ena, '0)
+    `FFL(inp_pipe_rnd_mode_q[i+1],     inp_pipe_rnd_mode_q[i],     reg_ena, fpnew_pkg::RNE)
+    `FFL(inp_pipe_op_q[i+1],           inp_pipe_op_q[i],           reg_ena, fpnew_pkg::MXDOTPF)
+    `FFL(inp_pipe_op_mod_q[i+1],       inp_pipe_op_mod_q[i],       reg_ena, '0)
+    `FFL(inp_pipe_src_fmt_q[i+1],      inp_pipe_src_fmt_q[i],      reg_ena, fpnew_pkg::fp_format_e'(0))
+    `FFL(inp_pipe_int_fmt_q[i+1],      inp_pipe_int_fmt_q[i],      reg_ena, fpnew_pkg::int_format_e'(0))
+    `FFL(inp_pipe_dst_fmt_q[i+1],      inp_pipe_dst_fmt_q[i],      reg_ena, fpnew_pkg::fp_format_e'(0))
+    `FFL(inp_pipe_tag_q[i+1],          inp_pipe_tag_q[i],          reg_ena, TagType'('0))
+    `FFL(inp_pipe_mask_q[i+1],         inp_pipe_mask_q[i],         reg_ena, '0)
+    `FFL(inp_pipe_aux_q[i+1],          inp_pipe_aux_q[i],          reg_ena, AuxType'('0))
+  end
+  // Output stage: assign selected pipe outputs to signals for later use
+  assign operands_a_q   = inp_pipe_operands_a_q[NUM_INP_REGS];
+  assign operands_b_q   = inp_pipe_operands_b_q[NUM_INP_REGS];
+  assign operands_a_fp6_rem_q = inp_pipe_operands_a_fp6_rem_q[NUM_INP_REGS];
+  assign operands_b_fp6_rem_q = inp_pipe_operands_b_fp6_rem_q[NUM_INP_REGS];
+  assign operands_c_q   = inp_pipe_operands_c_q[NUM_INP_REGS];
+  assign operand_d_q    = inp_pipe_operand_d_q[NUM_INP_REGS];
+  assign src_fmt_q      = inp_pipe_src_fmt_q[NUM_INP_REGS];
+  assign int_fmt_q      = inp_pipe_int_fmt_q[NUM_INP_REGS];
+  assign dst_fmt_q      = inp_pipe_dst_fmt_q[NUM_INP_REGS];
+
+  logic [2*VectorSize-1:0][SRC_WIDTH-1:0] operands_post_inp_pipe;
+  logic [2*FP6_VECTOR_SIZE-1:0][SRC_WIDTH-1:0] fp6_operands_post_inp_pipe;
+  logic [2*FP4_VECTOR_SIZE-1:0][SRC_WIDTH-1:0] fp4_operands_post_inp_pipe;
+
+  logic [VectorSize*SRC_WIDTH-1:0] flat_operands_a_q;
+  logic [VectorSize*SRC_WIDTH-1:0] flat_operands_b_q;
+
+  always_comb begin
+    fp6_operands_post_inp_pipe = '0;
+    fp4_operands_post_inp_pipe = '0;
+    operands_post_inp_pipe = {operands_b_q, operands_a_q};
+    flat_operands_a_q = operands_a_q;
+    flat_operands_b_q = operands_b_q;
+    // TODO: handle FP6 and FP4 when FP8/FP8ALT are disabled (slicing below hard-codes the FP8 lane split)
+    if (src_fmt_q == fpnew_pkg::FP6 || src_fmt_q == fpnew_pkg::FP6ALT) begin
+      for (int i = 0; i < FP6_VECTOR_SIZE; i++) begin // Last 3 elements use FP6 datapath
+        fp6_operands_post_inp_pipe[i] = {{(SRC_WIDTH-6){1'b0}}, flat_operands_a_q[(48+i*6) +: 6]};
+        fp6_operands_post_inp_pipe[i+FP6_VECTOR_SIZE] = {{(SRC_WIDTH-6){1'b0}}, flat_operands_b_q[(48+i*6) +: 6]};
+        if (i == FP6_VECTOR_SIZE-1) begin // Last element of the FP6 remainder extends to 66 bits
+          fp6_operands_post_inp_pipe[i][5:4] = operands_a_fp6_rem_q;
+          fp6_operands_post_inp_pipe[i+FP6_VECTOR_SIZE][5:4] = operands_b_fp6_rem_q;
+        end
+      end
+      for (int i = 0; i < VectorSize; i++) begin // First VectorSize elements (lowest bits) use FP8 datapath
+        operands_post_inp_pipe[i] = {{(SRC_WIDTH-6){1'b0}}, flat_operands_a_q[(i*6) +: 6]};
+        operands_post_inp_pipe[i+VectorSize] = {{(SRC_WIDTH-6){1'b0}}, flat_operands_b_q[(i*6) +: 6]};
+      end
+    end else if (src_fmt_q == fpnew_pkg::FP4) begin
+      for (int i = 0; i < VectorSize; i++) begin
+        if (i < FP6_VECTOR_SIZE) begin // First 3 elements use FP6 datapath
+          fp6_operands_post_inp_pipe[i] = {{(SRC_WIDTH-4){1'b0}}, operands_a_q[i][7:4]};
+          fp6_operands_post_inp_pipe[i+FP6_VECTOR_SIZE] = {{(SRC_WIDTH-4){1'b0}}, operands_b_q[i][7:4]};
+        end else begin // Last 5 elements use FP4 datapath, remaining elements already use FP8 datapath via operands_post_inp_pipe
+          fp4_operands_post_inp_pipe[i-FP6_VECTOR_SIZE] = {{(SRC_WIDTH-4){1'b0}}, operands_a_q[i][7:4]};
+          fp4_operands_post_inp_pipe[i-FP6_VECTOR_SIZE+FP4_VECTOR_SIZE] = {{(SRC_WIDTH-4){1'b0}}, operands_b_q[i][7:4]};
+        end
+      end
+    end
+  end
+
+  // -----------------
+  // Input processing
+  // -----------------
+  logic src_is_int; // if 0, it's a float
+
+  assign src_is_int = (inp_pipe_op_q[NUM_INP_REGS] == fpnew_pkg::MXDOTPI);
+
+  fp_src_t [VectorSize-1:0] operands_a, operands_b;
+  logic signed [1:0][SCALE_WIDTH-1:0] operands_c;
+  fp_dst_t             operand_d;
+  fpnew_pkg::fp_info_t [VectorSize-1:0] info_a, info_b;
+  fpnew_pkg::fp_info_t [1:0] info_c;
+  fpnew_pkg::fp_info_t info_d;
+
+  fp6_src_t [FP6_VECTOR_SIZE-1:0] fp6_operands_a, fp6_operands_b;
+  fpnew_pkg::fp_info_t [FP6_VECTOR_SIZE-1:0] fp6_info_a, fp6_info_b;
+
+  fp4_src_t [FP4_VECTOR_SIZE-1:0] fp4_operands_a, fp4_operands_b;
+  fpnew_pkg::fp_info_t [FP4_VECTOR_SIZE-1:0] fp4_info_a, fp4_info_b;
+
+  fpnew_mxdotp_classifier #(
+    .FpSrcFmtConfig ( FpSrcFmtConfig ),
+    .FpDstFmtConfig ( FpDstFmtConfig ),
+    .FP6VectorSize  ( FP6_VECTOR_SIZE ),
+    .FP4VectorSize  ( FP4_VECTOR_SIZE ),
+    .NumInpRegs     ( NUM_INP_REGS )
+  ) i_classifier (
+    .operands_post_inp_pipe(operands_post_inp_pipe),
+    .fp6_operands_post_inp_pipe(fp6_operands_post_inp_pipe),
+    .fp4_operands_post_inp_pipe(fp4_operands_post_inp_pipe),
+    .operands_c_q(operands_c_q),
+    .operand_d_q(operand_d_q),
+    .inp_pipe_is_boxed_q(inp_pipe_is_boxed_q),
+    .src_fmt_q(src_fmt_q),
+    .src_is_int(src_is_int),
+    .dst_fmt_q(dst_fmt_q),
+    .inp_pipe_op_mod_q(inp_pipe_op_mod_q),
+    .info_a(info_a),
+    .fp6_info_a(fp6_info_a),
+    .fp4_info_a(fp4_info_a),
+    .info_b(info_b),
+    .fp6_info_b(fp6_info_b),
+    .fp4_info_b(fp4_info_b),
+    .info_c(info_c),
+    .info_d(info_d),
+    .operands_a(operands_a),
+    .fp6_operands_a(fp6_operands_a),
+    .fp4_operands_a(fp4_operands_a),
+    .operands_b(operands_b),
+    .fp6_operands_b(fp6_operands_b),
+    .fp4_operands_b(fp4_operands_b),
+    .operands_c(operands_c),
+    .operand_d(operand_d)
+  );
+
+  // ---------------------
+  // Special case handling
+  // ---------------------
+
+  logic [DST_WIDTH-1:0] special_result;
+  fpnew_pkg::status_t   special_status;
+  logic                 result_is_special;
+
+  // Inf and NaN do not exist in FP6 and FP4 formats
+  if (FpSrcFmtConfig[fpnew_pkg::FP8] || FpSrcFmtConfig[fpnew_pkg::FP8ALT]) begin : special_case_handling
+    fpnew_mxdotp_special_cases #(
+      .FpDstFmtConfig ( FpDstFmtConfig )
+    ) i_special_cases (
+      .operands_a(operands_a),
+      .operands_b(operands_b),
+      .operands_c(operands_c),
+      .operand_d(operand_d),
+      .info_a(info_a),
+      .info_b(info_b),
+      .info_c(info_c),
+      .info_d(info_d),
+      .dst_fmt_q(dst_fmt_q),
+      .special_result(special_result),
+      .special_status(special_status),
+      .result_is_special(result_is_special)
+    );
+  end else begin : no_special_case_handling
+    assign special_result = '0;
+    assign special_status = fpnew_pkg::status_t'(0);
+    assign result_is_special = 1'b0;
+  end
+
+  // ------------------
+  // Scale data path
+  // ------------------
+  logic signed [SCALE_WIDTH:0] scale; // +1 for addition
+
+  fpnew_mxdotp_scale_adder #(
+  ) i_scale_adder (
+    .operands_c(operands_c),
+    .scale(scale)
+  );
+
+  // ------------------
+  // Product data path
+  // ------------------
+  logic signed [VectorSize-1:0][PROD_BITS-1:0] product_signed;  // two's complement product, already signed
+  logic signed [FP6_VECTOR_SIZE-1:0][2*FP6_PREC_BITS:0] fp6_product_signed;  // two's complement product, +1 for sign bit
+  logic signed [FP4_VECTOR_SIZE-1:0][2*FP4_PREC_BITS:0] fp4_product_signed;  // two's complement product, +1 for sign bit
+
+  if (IntSrcFmtConfig[fpnew_pkg::INT8]) begin : int8_multiplier
+    fpnew_mxdotp_signed_vector_multiplier #(
+      .SrcType(fp_src_t),
+      .LocalVectorSize(VectorSize),
+      .PrecisionBits(INT_SUPER_BITS)
+    ) i_vector_multiplier_int8 (
+      .operands_a(operands_a),
+      .operands_b(operands_b),
+      .src_fmt_q(src_fmt_q),
+      .int_fmt_q(int_fmt_q),
+      .src_is_int(src_is_int),
+      .info_a(info_a),
+      .info_b(info_b),
+      .product_signed(product_signed)
+    );
+  end else if (FpSrcFmtConfig[fpnew_pkg::FP8] || FpSrcFmtConfig[fpnew_pkg::FP8ALT]) begin : fp8_multiplier
+    fpnew_mxdotp_vector_multiplier #(
+      .SrcType(fp_src_t),
+      .LocalVectorSize(VectorSize),
+      .PrecisionBits(PRECISION_BITS)
+    ) i_vector_multiplier_fp8 (
+      .operands_a(operands_a),
+      .operands_b(operands_b),
+      .info_a(info_a),
+      .info_b(info_b),
+      .product_signed(product_signed)
+    );
+  end else begin : no_fp8_multiplier
+    assign product_signed = '0;
+  end
+
+  if (FpSrcFmtConfig[fpnew_pkg::FP6] || FpSrcFmtConfig[fpnew_pkg::FP6ALT]) begin : fp6_multiplier
+    fpnew_mxdotp_vector_multiplier #(
+      .SrcType(fp6_src_t),
+      .LocalVectorSize(FP6_VECTOR_SIZE),
+      .PrecisionBits(FP6_PREC_BITS)
+    ) i_vector_multiplier_fp6 (
+      .operands_a(fp6_operands_a),
+      .operands_b(fp6_operands_b),
+      .info_a(fp6_info_a),
+      .info_b(fp6_info_b),
+      .product_signed(fp6_product_signed)
+    );
+  end else begin : no_fp6_multiplier
+    assign fp6_product_signed = '0;
+  end
+  if (FpSrcFmtConfig[fpnew_pkg::FP4]) begin : fp4_multiplier
+    fpnew_mxdotp_vector_multiplier #(
+      .SrcType         ( fp4_src_t       ),
+      .LocalVectorSize ( FP4_VECTOR_SIZE ),
+      .PrecisionBits   ( FP4_PREC_BITS   )
+    ) i_vector_multiplier_fp4 (
+      .operands_a     ( fp4_operands_a     ),
+      .operands_b     ( fp4_operands_b     ),
+      .info_a         ( fp4_info_a         ),
+      .info_b         ( fp4_info_b         ),
+      .product_signed ( fp4_product_signed )
+    );
+  end else begin : no_fp4_multiplier
+    assign fp4_product_signed = '0; // FP4 disabled: tie products to zero
+  end
+
+  // ------------------
+  // Shift data path
+  // ------------------
+  logic signed [VectorSize-1:0][PROD_SHIFT_WIDTH-1:0] shifted_product;              // 8-bit path (FP8/FP8ALT/INT8)
+  logic signed [FP6_VECTOR_SIZE-1:0][FP6_PROD_SHIFT_WIDTH-1:0] fp6_shifted_product; // FP6/FP6ALT path
+  logic signed [FP4_VECTOR_SIZE-1:0][FP4_PROD_SHIFT_WIDTH-1:0] fp4_shifted_product; // FP4 path
+  // Must be generated whenever this path's multiplier and adder tree are, i.e. also for INT8-only configurations
+  if (FpSrcFmtConfig[fpnew_pkg::FP8] || FpSrcFmtConfig[fpnew_pkg::FP8ALT] || IntSrcFmtConfig[fpnew_pkg::INT8]) begin : fp8_product_shifter
+    fpnew_mxdotp_product_shifter #(
+      .SrcType(fp_src_t),
+      .LocalVectorSize(VectorSize),
+      .SrcFmt(fpnew_pkg::FP8), // TODO: For now, we assume that FP8 and FP8ALT are always enabled together
+      .ProductBits(PROD_BITS),
+      .ExpWidth(EXP_WIDTH),
+      .OutputWidth(PROD_SHIFT_WIDTH)
+    ) i_product_shifter_fp8 (
+      .operands_a(operands_a),
+      .operands_b(operands_b),
+      .info_a(info_a),
+      .info_b(info_b),
+      .product_signed(product_signed),
+      .src_fmt_q(src_fmt_q),
+      .int_fmt_q(int_fmt_q),
+      .src_is_int(src_is_int),
+      .shifted_product(shifted_product)
+    );
+  end else begin : no_fp8_product_shifter
+    assign shifted_product = '0; // no 8-bit source format enabled at all
+  end
+
+  if (FpSrcFmtConfig[fpnew_pkg::FP6] || FpSrcFmtConfig[fpnew_pkg::FP6ALT]) begin : fp6_product_shifter
+    fpnew_mxdotp_product_shifter #(
+      .SrcType         ( fp6_src_t            ),
+      .SrcFmt          ( fpnew_pkg::FP6       ), // TODO: For now, we assume that FP6 and FP6ALT are always enabled together
+      .LocalVectorSize ( FP6_VECTOR_SIZE      ),
+      .ExpWidth        ( 5                    ),
+      .ProductBits     ( 2*FP6_PREC_BITS+1    ),
+      .OutputWidth     ( FP6_PROD_SHIFT_WIDTH )
+    ) i_product_shifter_fp6 (
+      .src_fmt_q       ( src_fmt_q           ),
+      .int_fmt_q       ( int_fmt_q           ),
+      .src_is_int      ( src_is_int          ),
+      .operands_a      ( fp6_operands_a      ),
+      .operands_b      ( fp6_operands_b      ),
+      .info_a          ( fp6_info_a          ),
+      .info_b          ( fp6_info_b          ),
+      .product_signed  ( fp6_product_signed  ),
+      .shifted_product ( fp6_shifted_product )
+    );
+  end else begin : no_fp6_product_shifter
+    assign fp6_shifted_product = '0; // FP6 and FP6ALT both disabled
+  end
+  if (FpSrcFmtConfig[fpnew_pkg::FP4]) begin : fp4_product_shifter
+    fpnew_mxdotp_product_shifter #(
+      .SrcType         ( fp4_src_t            ),
+      .SrcFmt          ( fpnew_pkg::FP4       ),
+      .LocalVectorSize ( FP4_VECTOR_SIZE      ),
+      .ExpWidth        ( 3                    ),
+      .ProductBits     ( 2*FP4_PREC_BITS+1    ),
+      .OutputWidth     ( FP4_PROD_SHIFT_WIDTH )
+    ) i_product_shifter_fp4 (
+      .src_fmt_q       ( src_fmt_q           ),
+      .int_fmt_q       ( int_fmt_q           ),
+      .src_is_int      ( src_is_int          ),
+      .operands_a      ( fp4_operands_a      ),
+      .operands_b      ( fp4_operands_b      ),
+      .info_a          ( fp4_info_a          ),
+      .info_b          ( fp4_info_b          ),
+      .product_signed  ( fp4_product_signed  ),
+      .shifted_product ( fp4_shifted_product )
+    );
+  end else begin : no_fp4_product_shifter
+    assign fp4_shifted_product = '0; // FP4 disabled
+  end
+
+  // ------------------
+  // Adder data path
+  // ------------------
+  logic signed [FIXED_SUM_WIDTH-1:0] sum_product;      // combined sum handed to the mid pipeline
+  logic signed [SOP_FIXED_WIDTH-1:0] sum_product_fp8;  // per-path sums of the shifted products
+  logic signed [FP6_SUM_WIDTH-1:0]   sum_product_fp6;
+  logic signed [FP4_SUM_WIDTH-1:0]   sum_product_fp4;
+  // 8-bit path adder tree: present for any of FP8, FP8ALT or INT8
+  if (FpSrcFmtConfig[fpnew_pkg::FP8] || FpSrcFmtConfig[fpnew_pkg::FP8ALT] || IntSrcFmtConfig[fpnew_pkg::INT8]) begin : fp8_adder_tree
+    fpnew_mxdotp_adder_tree #(
+      .LocalVectorSize ( VectorSize       ),
+      .InputWidth      ( PROD_SHIFT_WIDTH ),
+      .OutputWidth     ( SOP_FIXED_WIDTH  )
+    ) i_adder_tree_fp8 (
+      .shifted_product ( shifted_product ),
+      .sum_product     ( sum_product_fp8 )
+    );
+  end else begin : no_fp8_adder_tree
+    assign sum_product_fp8 = '0; // no 8-bit source format enabled
+  end
+
+  if (FpSrcFmtConfig[fpnew_pkg::FP6] || FpSrcFmtConfig[fpnew_pkg::FP6ALT]) begin : fp6_adder_tree
+    fpnew_mxdotp_adder_tree #(
+      .LocalVectorSize ( FP6_VECTOR_SIZE      ),
+      .InputWidth      ( FP6_PROD_SHIFT_WIDTH ),
+      .OutputWidth     ( FP6_SUM_WIDTH        )
+    ) i_adder_tree_fp6 (
+      .shifted_product ( fp6_shifted_product ),
+      .sum_product     ( sum_product_fp6     )
+    );
+  end else begin : no_fp6_adder_tree
+    assign sum_product_fp6 = '0; // FP6 and FP6ALT both disabled
+  end
+  if (FpSrcFmtConfig[fpnew_pkg::FP4]) begin : fp4_adder_tree
+    fpnew_mxdotp_adder_tree #(
+      .LocalVectorSize ( FP4_VECTOR_SIZE      ),
+      .InputWidth      ( FP4_PROD_SHIFT_WIDTH ),
+      .OutputWidth     ( FP4_SUM_WIDTH        )
+    ) i_adder_tree_fp4 (
+      .shifted_product ( fp4_shifted_product ),
+      .sum_product     ( sum_product_fp4     )
+    );
+  end else begin : no_fp4_adder_tree
+    assign sum_product_fp4 = '0; // FP4 disabled
+  end
+
+  // Single adder merging the three per-format sums; the sums of disabled formats are tied to zero
+  fpnew_mxdotp_format_adder #(
+    .Fp4SumWidth ( FP4_SUM_WIDTH ),
+    .Fp6SumWidth ( FP6_SUM_WIDTH )
+  ) i_format_adder (
+    .sum_product_fp4 ( sum_product_fp4 ),
+    .sum_product_fp6 ( sum_product_fp6 ),
+    .sum_product_fp8 ( sum_product_fp8 ),
+    .sum_product     ( sum_product     )
+  );
+
+  // ---------------
+  // Internal pipeline
+  // ---------------
+  // Scalar copies of the last mid-pipeline stage, used by the logic that follows the pipeline
+  fpnew_pkg::status_t                special_status_q;
+  logic [DST_WIDTH-1:0]              special_result_q;
+  logic                              result_is_special_q;
+  fpnew_pkg::roundmode_e             rnd_mode_q;
+  fpnew_pkg::fp_format_e             dst_fmt_q2;
+  fpnew_pkg::fp_info_t               info_d_q;
+  fp_dst_t                           operand_d_q2;
+  logic [SCALE_WIDTH:0]              scale_q2;
+  logic signed [FIXED_SUM_WIDTH-1:0] sum_product_q;
+  // Per-stage arrays: entry i is the signal after i register stages (entry 0 is the unregistered input)
+  logic                  [0:NUM_MID_REGS]                         mid_pipe_valid_q;
+  AuxType                [0:NUM_MID_REGS]                         mid_pipe_aux_q;
+  logic                  [0:NUM_MID_REGS]                         mid_pipe_mask_q;
+  TagType                [0:NUM_MID_REGS]                         mid_pipe_tag_q;
+  fpnew_pkg::status_t    [0:NUM_MID_REGS]                         mid_pipe_spec_stat_q;
+  logic                  [0:NUM_MID_REGS][DST_WIDTH-1:0]          mid_pipe_spec_res_q;
+  logic                  [0:NUM_MID_REGS]                         mid_pipe_res_is_spec_q;
+  fpnew_pkg::roundmode_e [0:NUM_MID_REGS]                         mid_pipe_rnd_mode_q;
+  fpnew_pkg::fp_format_e [0:NUM_MID_REGS]                         mid_pipe_dst_fmt_q;
+  fpnew_pkg::fp_info_t   [0:NUM_MID_REGS]                         mid_pipe_info_d_q;
+  fp_dst_t               [0:NUM_MID_REGS]                         mid_pipe_operand_d_q;
+  logic                  [0:NUM_MID_REGS][SCALE_WIDTH:0]          mid_pipe_scale_q;
+  logic signed           [0:NUM_MID_REGS][FIXED_SUM_WIDTH-1:0]    mid_pipe_sum_product_q;
+  // Backward-travelling ready, purely combinational across all stages
+  logic [0:NUM_MID_REGS] mid_pipe_ready;
+
+  // Stage 0, handshake and bookkeeping: forwarded from the last input-pipeline stage
+  assign mid_pipe_valid_q[0]       = inp_pipe_valid_q[NUM_INP_REGS];
+  assign mid_pipe_aux_q[0]         = inp_pipe_aux_q[NUM_INP_REGS];
+  assign mid_pipe_mask_q[0]        = inp_pipe_mask_q[NUM_INP_REGS];
+  assign mid_pipe_tag_q[0]         = inp_pipe_tag_q[NUM_INP_REGS];
+  assign mid_pipe_rnd_mode_q[0]    = inp_pipe_rnd_mode_q[NUM_INP_REGS];
+  // Stage 0, data: results of the combinational logic in front of the pipeline
+  assign mid_pipe_spec_stat_q[0]   = special_status;
+  assign mid_pipe_spec_res_q[0]    = special_result;
+  assign mid_pipe_res_is_spec_q[0] = result_is_special;
+  assign mid_pipe_dst_fmt_q[0]     = dst_fmt_q;
+  assign mid_pipe_info_d_q[0]      = info_d;
+  assign mid_pipe_operand_d_q[0]   = operand_d;
+  assign mid_pipe_scale_q[0]       = scale;
+  assign mid_pipe_sum_product_q[0] = sum_product;
+  assign inp_pipe_ready[NUM_INP_REGS] = mid_pipe_ready[0]; // back-pressure towards the input pipeline
+
+  // Generate NUM_MID_REGS register stages (no registers at all when NUM_MID_REGS is 0)
+  for (genvar i = 0; i < NUM_MID_REGS; i++) begin : gen_inside_pipeline
+    // Load enable shared by all data registers of this stage
+    logic reg_ena;
+    // Determine the ready signal of the current stage - advance the pipeline:
+    // 1. if the next stage is ready for our data
+    // 2. if the next stage only holds a bubble (not valid) -> we can pop it
+    assign mid_pipe_ready[i] = mid_pipe_ready[i+1] | ~mid_pipe_valid_q[i+1];
+    // Valid: loaded when this stage is ready, cleared by flush_i, resets to 1'b0 (clocked by clk_i, reset rst_ni)
+    `FFLARNC(mid_pipe_valid_q[i+1], mid_pipe_valid_q[i], mid_pipe_ready[i], flush_i, 1'b0, clk_i, rst_ni)
+    // Data registers only capture when the stage is ready AND holds a valid item (bubbles do not toggle them)
+    assign reg_ena = mid_pipe_ready[i] & mid_pipe_valid_q[i];
+    // Data registers: `FFL(q, d, load, reset value); clock/reset come from the macro defaults (see registers.svh)
+    `FFL(mid_pipe_sum_product_q[i+1], mid_pipe_sum_product_q[i], reg_ena, '0)
+    `FFL(mid_pipe_scale_q[i+1],       mid_pipe_scale_q[i],       reg_ena, '0)
+    `FFL(mid_pipe_operand_d_q[i+1],   mid_pipe_operand_d_q[i],   reg_ena, '0)
+    `FFL(mid_pipe_info_d_q[i+1],      mid_pipe_info_d_q[i],      reg_ena, '0)
+    `FFL(mid_pipe_dst_fmt_q[i+1],     mid_pipe_dst_fmt_q[i],     reg_ena, fpnew_pkg::fp_format_e'(0))
+    `FFL(mid_pipe_rnd_mode_q[i+1],    mid_pipe_rnd_mode_q[i],    reg_ena, fpnew_pkg::RNE)
+    `FFL(mid_pipe_res_is_spec_q[i+1], mid_pipe_res_is_spec_q[i], reg_ena, '0)
+    `FFL(mid_pipe_spec_res_q[i+1],    mid_pipe_spec_res_q[i],    reg_ena, '0)
+    `FFL(mid_pipe_spec_stat_q[i+1],   mid_pipe_spec_stat_q[i],   reg_ena, '0)
+    `FFL(mid_pipe_tag_q[i+1],         mid_pipe_tag_q[i],         reg_ena, TagType'('0))
+    `FFL(mid_pipe_mask_q[i+1],        mid_pipe_mask_q[i],        reg_ena, '0)
+    `FFL(mid_pipe_aux_q[i+1],         mid_pipe_aux_q[i],         reg_ena, AuxType'('0))
+  end
+  // Tap the last mid-pipeline stage into plain signals for the accumulator, normalizer and rounder logic
+  assign special_status_q    = mid_pipe_spec_stat_q[NUM_MID_REGS];
+  assign special_result_q    = mid_pipe_spec_res_q[NUM_MID_REGS];
+  assign result_is_special_q = mid_pipe_res_is_spec_q[NUM_MID_REGS];
+  assign rnd_mode_q          = mid_pipe_rnd_mode_q[NUM_MID_REGS];
+  assign dst_fmt_q2          = mid_pipe_dst_fmt_q[NUM_MID_REGS];
+  assign info_d_q            = mid_pipe_info_d_q[NUM_MID_REGS];
+  assign operand_d_q2        = mid_pipe_operand_d_q[NUM_MID_REGS];
+  assign scale_q2            = mid_pipe_scale_q[NUM_MID_REGS];
+  assign sum_product_q       = mid_pipe_sum_product_q[NUM_MID_REGS];
+
+  // -----------------------------
+  // Accumulator shift data path
+  // -----------------------------
+  logic accumulator_is_right_shifted;
+  logic result_is_accumulator;
+  // Signals connecting the accumulator shifter with the adder and normalizer instantiated further down
+  logic                                 accumulator_sticky;
+  logic signed [9:0]                    accumulator_right_shift_amount;
+  logic signed [DST_PRECISION_BITS :0]  signed_mantissa_d;
+  logic signed [DST_PRECISION_BITS-1:0] accumulator_remaining;
+  logic signed [FIXED_SUM_WIDTH-1:0]    accumulator_shifted;
+
+  fpnew_mxdotp_accumulator_shift #(
+  ) i_accumulator_shift (
+    .sum_product_q                  ( sum_product_q                  ),
+    .scale_q2                       ( scale_q2                       ),
+    .dst_fmt_q2                     ( dst_fmt_q2                     ),
+    .info_d_q                       ( info_d_q                       ),
+    .operand_d_q2                   ( operand_d_q2                   ),
+    .signed_mantissa_d              ( signed_mantissa_d              ),
+    .result_is_accumulator          ( result_is_accumulator          ),
+    .accumulator_is_right_shifted   ( accumulator_is_right_shifted   ),
+    .accumulator_right_shift_amount ( accumulator_right_shift_amount ),
+    .accumulator_shifted            ( accumulator_shifted            ),
+    .accumulator_remaining          ( accumulator_remaining          ),
+    .accumulator_sticky             ( accumulator_sticky             )
+  );
+
+  // -----------------
+  // Accumulator + SoP
+  // -----------------
+  logic signed [LZC_SUM_WIDTH-1:0] sum_product_accumulator_extended;
+  // Combines the registered sum of products with the shifted accumulator (default parameters)
+  fpnew_mxdotp_add_accumulator_sop #(
+  ) i_add_accumulator_sop (
+    .accumulator_remaining            ( accumulator_remaining            ),
+    .accumulator_shifted              ( accumulator_shifted              ),
+    .sum_product_q                    ( sum_product_q                    ),
+    .sum_product_accumulator_extended ( sum_product_accumulator_extended )
+  );
+
+  // --------------
+  // Normalization
+  // --------------
+  logic                                 final_sign;
+  logic signed [DST_EXP_WIDTH-1:0]      final_exponent;
+  logic        [DST_PRECISION_BITS-1:0] final_mantissa;
+  logic                                 sticky_after_norm;
+  logic        [LZC_SUM_WIDTH-1:0]      sum_magnitude;
+  // Normalizer instance (default parameters); its sign/exponent/mantissa signals feed the rounder
+  fpnew_mxdotp_normalizer #(
+  ) i_normalizer (
+    .dst_fmt_q2                       ( dst_fmt_q2                       ),
+    .scale_q2                         ( scale_q2                         ),
+    .signed_mantissa_d                ( signed_mantissa_d                ),
+    .sum_product_accumulator_extended ( sum_product_accumulator_extended ),
+    .accumulator_is_right_shifted     ( accumulator_is_right_shifted     ),
+    .accumulator_right_shift_amount   ( accumulator_right_shift_amount   ),
+    .accumulator_sticky               ( accumulator_sticky               ),
+    .sum_magnitude                    ( sum_magnitude                    ),
+    .final_sign                       ( final_sign                       ),
+    .final_exponent                   ( final_exponent                   ),
+    .final_mantissa                   ( final_mantissa                   ),
+    .sticky_after_norm                ( sticky_after_norm                )
+  );
+
+
+  // ----------------------------
+  // Rounding and classification
+  // ----------------------------
+  logic [NUM_FORMATS-1:0][DST_WIDTH-1:0] fmt_result;        // one candidate result per format index
+  logic [1:0]                            round_sticky_bits;
+
+  logic uf_after_round;                  // underflow
+  logic of_before_round, of_after_round; // overflow
+
+  fpnew_mxdotp_rounder #(
+    .FpDstFmtConfig ( FpDstFmtConfig )
+  ) i_rounder (
+    .clk_i             ( clk_i             ),
+    .rst_ni            ( rst_ni            ),
+    .dst_fmt_q2        ( dst_fmt_q2        ),
+    .rnd_mode_q        ( rnd_mode_q        ),
+    .final_sign        ( final_sign        ),
+    .final_exponent    ( final_exponent    ),
+    .final_mantissa    ( final_mantissa    ),
+    .sticky_after_norm ( sticky_after_norm ),
+    .sum_magnitude     ( sum_magnitude     ),
+    .fmt_result        ( fmt_result        ),
+    .round_sticky_bits ( round_sticky_bits ),
+    .of_before_round   ( of_before_round   ),
+    .of_after_round    ( of_after_round    ),
+    .uf_after_round    ( uf_after_round    )
+  );
+
+  // -----------------
+  // Result selection
+  // -----------------
+  fpnew_pkg::status_t   regular_status;
+  fpnew_pkg::status_t   accumulator_status;
+  logic [DST_WIDTH-1:0] regular_result;
+  logic [DST_WIDTH-1:0] accumulator_result;
+
+  // Regular path: rounded result of the selected destination format plus the flags derived from rounding
+  assign regular_result    = fmt_result[dst_fmt_q2];
+  assign regular_status.NX = of_before_round | of_after_round | (| round_sticky_bits); // lost bits or overflow
+  assign regular_status.UF = regular_status.NX & uf_after_round; // underflow is flagged for inexact results only
+  assign regular_status.OF = of_after_round | of_before_round;   // overflow before or introduced by rounding
+  assign regular_status.DZ = 1'b0; // this unit performs no divisions
+  assign regular_status.NV = 1'b0; // invalid cases never take the regular path
+
+  // Accumulator path: operand d dominates, the only possible flag is inexact (raised for a non-zero SoP)
+  assign accumulator_status.NX = (sum_product_q != '0);
+  assign accumulator_status.UF = 1'b0;
+  assign accumulator_status.OF = 1'b0;
+  assign accumulator_status.DZ = 1'b0;
+  assign accumulator_status.NV = 1'b0;
+  // FP16ALT: bits [31:16] of operand d are moved to the low half below 16 one-bits; otherwise operand d as is
+  assign accumulator_result = (dst_fmt_q2 == fpnew_pkg::FP16ALT) ? {16'hFFFF, operand_d_q2[31:16]} : operand_d_q2;
+
+  // Values handed to the output pipeline
+  logic [DST_WIDTH-1:0] result_d;
+  fpnew_pkg::status_t   status_d;
+  // Priority: special case first, then accumulator pass-through, then the regular rounded result
+  assign result_d = result_is_special_q   ? special_result_q   :
+                    result_is_accumulator ? accumulator_result : regular_result;
+  assign status_d = result_is_special_q   ? special_status_q   :
+                    result_is_accumulator ? accumulator_status : regular_status;
+
+  // ----------------
+  // Output Pipeline
+  // ----------------
+  // Per-stage arrays of the output pipeline: entry i is the signal after i register stages
+  logic               [0:NUM_OUT_REGS]                out_pipe_valid_q;
+  AuxType             [0:NUM_OUT_REGS]                out_pipe_aux_q;
+  logic               [0:NUM_OUT_REGS]                out_pipe_mask_q;
+  TagType             [0:NUM_OUT_REGS]                out_pipe_tag_q;
+  fpnew_pkg::status_t [0:NUM_OUT_REGS]                out_pipe_status_q;
+  logic               [0:NUM_OUT_REGS][DST_WIDTH-1:0] out_pipe_result_q;
+  // Backward-travelling ready, purely combinational across all stages
+  logic [0:NUM_OUT_REGS] out_pipe_ready;
+
+  // Stage 0: selected result/status plus the bookkeeping of the last mid-pipeline stage
+  assign out_pipe_valid_q[0]  = mid_pipe_valid_q[NUM_MID_REGS];
+  assign out_pipe_aux_q[0]    = mid_pipe_aux_q[NUM_MID_REGS];
+  assign out_pipe_mask_q[0]   = mid_pipe_mask_q[NUM_MID_REGS];
+  assign out_pipe_tag_q[0]    = mid_pipe_tag_q[NUM_MID_REGS];
+  assign out_pipe_status_q[0] = status_d;
+  assign out_pipe_result_q[0] = result_d;
+  // Stage 0: back-pressure towards the mid pipeline
+  assign mid_pipe_ready[NUM_MID_REGS] = out_pipe_ready[0];
+  // Generate NUM_OUT_REGS register stages (no registers at all when NUM_OUT_REGS is 0)
+  for (genvar i = 0; i < NUM_OUT_REGS; i++) begin : gen_output_pipeline
+    // Load enable shared by all data registers of this stage
+    logic reg_ena;
+    // Determine the ready signal of the current stage - advance the pipeline:
+    // 1. if the next stage is ready for our data
+    // 2. if the next stage only holds a bubble (not valid) -> we can pop it
+    assign out_pipe_ready[i] = out_pipe_ready[i+1] | ~out_pipe_valid_q[i+1];
+    // Valid: loaded when this stage is ready, cleared by flush_i, resets to 1'b0 (clocked by clk_i, reset rst_ni)
+    `FFLARNC(out_pipe_valid_q[i+1], out_pipe_valid_q[i], out_pipe_ready[i], flush_i, 1'b0, clk_i, rst_ni)
+    // Data registers only capture when the stage is ready AND holds a valid item (bubbles do not toggle them)
+    assign reg_ena = out_pipe_ready[i] & out_pipe_valid_q[i];
+    // Data registers: `FFL(q, d, load, reset value); clock/reset come from the macro defaults (see registers.svh)
+    `FFL(out_pipe_result_q[i+1], out_pipe_result_q[i], reg_ena, '0)
+    `FFL(out_pipe_status_q[i+1], out_pipe_status_q[i], reg_ena, '0)
+    `FFL(out_pipe_tag_q[i+1],    out_pipe_tag_q[i],    reg_ena, TagType'('0))
+    `FFL(out_pipe_mask_q[i+1],   out_pipe_mask_q[i],   reg_ena, '0)
+    `FFL(out_pipe_aux_q[i+1],    out_pipe_aux_q[i],    reg_ena, AuxType'('0))
+  end
+  // Last stage: the downstream consumer drives the ready that ripples back through the pipelines
+  assign out_pipe_ready[NUM_OUT_REGS] = out_ready_i;
+  // Last stage: drive the module outputs
+  assign out_valid_o     = out_pipe_valid_q[NUM_OUT_REGS];
+  assign aux_o           = out_pipe_aux_q[NUM_OUT_REGS];
+  assign mask_o          = out_pipe_mask_q[NUM_OUT_REGS];
+  assign tag_o           = out_pipe_tag_q[NUM_OUT_REGS];
+  assign status_o        = out_pipe_status_q[NUM_OUT_REGS];
+  assign result_o        = out_pipe_result_q[NUM_OUT_REGS];
+  assign extension_bit_o = 1'b1; // result is always NaN-boxed
+  assign busy_o          = (| {inp_pipe_valid_q, mid_pipe_valid_q, out_pipe_valid_q}); // any valid item in flight
+endmodule
diff --git a/src/fpnew_mxdotp_multi_wrapper.sv b/src/fpnew_mxdotp_multi_wrapper.sv
new file mode 100644
index 00000000..2efccc60
--- /dev/null
+++ b/src/fpnew_mxdotp_multi_wrapper.sv
@@ -0,0 +1,245 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+//
+// Copyright and related rights are licensed under the Solderpad Hardware
+// License, Version 0.51 (the "License"); you may not use this file except in
+// compliance with the License. You may obtain a copy of the License at
+// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+// or agreed to in writing, software, hardware and materials distributed under
+// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
+
+// Author: Gamze Islamoglu <gislamoglu@iis.ee.ethz.ch>
+
+module fpnew_mxdotp_multi_wrapper // Operand unpacking and result NaN-boxing around fpnew_mxdotp_multi
+  import fpnew_mxdotp_multi_pkg::*;
+#(
+  parameter int unsigned             LaneWidth       = 64,  // Operand/result width; NOTE(review): FP6 unpacking indexes bits up to 63, presumably only 64 works - confirm
+  parameter fpnew_pkg::fmt_logic_t   FpSrcFmtConfig  = '1,  // Supported FP source formats (FP8, FP8ALT, FP6, FP6ALT, FP4)
+  parameter fpnew_pkg::ifmt_logic_t  IntSrcFmtConfig = '1,  // Supported INT formats (INT8)
+  parameter fpnew_pkg::fmt_logic_t   FpDstFmtConfig  = '1,  // Supported FP destination formats (FP32, FP16ALT)
+  parameter int unsigned             Unroll          = 8,   // Unroll factor for FP6 extended operands, possible values: 1, 2, 4, 8
+  parameter int unsigned             NumPipeRegs     = 4,   // Forwarded unchanged to fpnew_mxdotp_multi
+  parameter fpnew_pkg::pipe_config_t PipeConfig      = fpnew_pkg::BEFORE, // Forwarded unchanged to fpnew_mxdotp_multi
+  parameter type                     TagType         = logic,
+  parameter type                     AuxType         = logic,
+  parameter fpnew_pkg::rsr_impl_t    StochasticRndImplementation = fpnew_pkg::DEFAULT_NO_RSR, // NOTE(review): not referenced inside this module
+  // Do not change
+  localparam int                     OPERAND_WIDTH    = LaneWidth,
+  localparam int                     UNROLL_IDX_WIDTH = (Unroll > 1) ? $clog2(Unroll) : 1 // at least 1 bit so the index signal never has zero width
+) (
+  input logic                          clk_i,
+  input logic                          rst_ni,
+  // Input signals
+  input logic [2:0][OPERAND_WIDTH-1:0] operands_i, // 3 operands: [0]/[1] multiplicand words, [2] accumulator and scales
+  input logic [NUM_FORMATS-1:0][2:0]   is_boxed_i, // 3 operands
+  input fpnew_pkg::roundmode_e         rnd_mode_i,
+  input fpnew_pkg::operation_e         op_i,
+  input logic                          op_mod_i,
+  input fpnew_pkg::fp_format_e         src_fmt_i,
+  input fpnew_pkg::int_format_e        int_fmt_i,
+  input fpnew_pkg::fp_format_e         dst_fmt_i,
+  input TagType                        tag_i,
+  input logic                          mask_i,
+  input AuxType                        aux_i,
+  // Input Handshake
+  input  logic                         in_valid_i,
+  output logic                         in_ready_o,
+  input  logic                         flush_i,
+  // Output signals
+  output logic [OPERAND_WIDTH-1:0]     result_o,
+  output fpnew_pkg::status_t           status_o,
+  output logic                         extension_bit_o,
+  output TagType                       tag_o,
+  output logic                         mask_o,
+  output AuxType                       aux_o,
+  // Output handshake
+  output logic                         out_valid_o,
+  input  logic                         out_ready_i,
+  // Indication of valid data in flight
+  output logic                         busy_o
+);
+
+  // -----------------
+  // Input processing
+  // -----------------
+  logic [VectorSize-1:0][SRC_WIDTH-1:0] local_src_fmt_operand_a;
+  logic [VectorSize-1:0][SRC_WIDTH-1:0] local_src_fmt_operand_b;
+  logic [1:0] local_src_fmt_operand_a_rem; // extra FP6 bits that do not fit into the operand word
+  logic [1:0] local_src_fmt_operand_b_rem;
+  logic [1:0][SCALE_WIDTH-1:0] local_src_fmt_operand_c; // two scale values taken from operand 2
+  logic [NUM_FORMATS-1:0][DST_WIDTH-1:0] local_src_fmt_operand_d; // accumulator candidate per format index
+  logic [NUM_FORMATS-1:0][NUM_OPERANDS-1:0] local_is_boxed;
+  logic [OPERAND_WIDTH-1:0] local_result;
+
+  // -------------------------
+  // Extended operands for FP6
+  // -------------------------
+
+  if (FpSrcFmtConfig[fpnew_pkg::FP6] || FpSrcFmtConfig[fpnew_pkg::FP6ALT]) begin : gen_fp6_operands
+
+    typedef enum logic [1:0] {
+      STEP0 = 2'b00,
+      STEP1 = 2'b01,
+      STEP2 = 2'b10
+    } fp6_step_e;
+
+    fp6_step_e step;
+
+    // Count for the number of FP6 extended operands processed
+    // Each 192b/6b = 32 FP6 operands are processed in 3 steps
+    logic [$clog2(3*Unroll)-1:0] count_q, count_d; // accepted FP6 words, wraps after 3*Unroll
+    logic [UNROLL_IDX_WIDTH-1:0] unroll_index;     // count modulo Unroll: selects the carry-over slot
+
+    // Store the FP6 extended operands
+    logic [1:0][Unroll-1:0][3:0] local_fp6_stores_d, local_fp6_stores_q; // [operand a/b][slot][carried bits]
+    logic [1:0][3:0] local_fp6_stores; // bits of the current word to carry into the next step
+
+    if (Unroll > 1) begin : gen_unroll_idx
+      assign unroll_index = count_q[$clog2(Unroll)-1:0];
+    end else begin : gen_no_unroll
+      assign unroll_index = '0;
+    end
+    // step = count / Unroll: the first Unroll accepted words are STEP0, the next Unroll STEP1, the last Unroll STEP2
+    assign step = fp6_step_e'(count_q >> $clog2(Unroll));
+
+    always_comb begin
+      count_d            = count_q;
+      local_fp6_stores_d = local_fp6_stores_q;
+
+      local_fp6_stores = '0;
+
+      local_src_fmt_operand_a     = '0;
+      local_src_fmt_operand_b     = '0;
+      local_src_fmt_operand_a_rem = '0;
+      local_src_fmt_operand_b_rem = '0;
+      // FP6 sources: re-align the 64b input words; all other formats pass through unchanged (else branch)
+      if (src_fmt_i == fpnew_pkg::FP6 || src_fmt_i == fpnew_pkg::FP6ALT) begin
+        if (step == STEP0) begin
+          local_src_fmt_operand_a = {4'b0000, operands_i[0][59:0]}; // low 60 bits (10 x 6b), zero-padded to 64
+          local_fp6_stores[0]     = operands_i[0][63:60];           // top 4 bits are carried into STEP1
+          local_src_fmt_operand_b = {4'b0000, operands_i[1][59:0]};
+          local_fp6_stores[1]     = operands_i[1][63:60];
+        end else if (step == STEP1) begin
+          local_src_fmt_operand_a     = {operands_i[0][59:0], local_fp6_stores_q[0][unroll_index][3:0]}; // 4 carried bits become the LSBs
+          local_src_fmt_operand_a_rem = operands_i[0][61:60];          // 2 further bits leave via the *_rem signal
+          local_fp6_stores[0]         = {2'b00, operands_i[0][63:62]}; // top 2 bits are carried into STEP2
+          local_src_fmt_operand_b     = {operands_i[1][59:0], local_fp6_stores_q[1][unroll_index][3:0]};
+          local_src_fmt_operand_b_rem = operands_i[1][61:60];
+          local_fp6_stores[1]         = {2'b00, operands_i[1][63:62]};
+        end else if (step == STEP2) begin
+          local_src_fmt_operand_a     = {operands_i[0][61:0], local_fp6_stores_q[0][unroll_index][1:0]}; // 2 carried bits become the LSBs
+          local_src_fmt_operand_a_rem = operands_i[0][63:62];          // nothing left to carry after this step
+          local_src_fmt_operand_b     = {operands_i[1][61:0], local_fp6_stores_q[1][unroll_index][1:0]};
+          local_src_fmt_operand_b_rem = operands_i[1][63:62];
+        end
+        // Carry-over store and counter only advance when a word is actually accepted
+        if (in_valid_i && in_ready_o) begin
+          // Store the FP6 extended operands
+          local_fp6_stores_d[0][unroll_index] = local_fp6_stores[0];
+          local_fp6_stores_d[1][unroll_index] = local_fp6_stores[1];
+          count_d = count_q + 1;
+          if (count_d == 3 * Unroll) begin // sequence complete: restart with STEP0
+            count_d = '0;
+          end
+        end
+      end else begin
+        local_src_fmt_operand_a = operands_i[0];
+        local_src_fmt_operand_b = operands_i[1];
+      end
+    end
+    // NOTE(review): count_q and the store are cleared by reset only, flush_i is not considered - confirm this is intended
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+      if (!rst_ni) begin
+        count_q            <= '0;
+        local_fp6_stores_q <= '0;
+      end else begin
+        count_q            <= count_d;
+        local_fp6_stores_q <= local_fp6_stores_d;
+      end
+    end
+
+  end else begin : gen_no_fp6_operands
+    assign local_src_fmt_operand_a     = operands_i[0]; // without FP6 support the operand words pass through
+    assign local_src_fmt_operand_b     = operands_i[1];
+    assign local_src_fmt_operand_a_rem = '0;
+    assign local_src_fmt_operand_b_rem = '0;
+  end
+
+  // ----------------------------------
+  // assign scale operands
+  // ----------------------------------
+  assign local_src_fmt_operand_c[1] = operands_i[2][(DST_WIDTH+SCALE_WIDTH)+:SCALE_WIDTH]; // second scale, directly above the first
+  assign local_src_fmt_operand_c[0] = operands_i[2][DST_WIDTH+:SCALE_WIDTH];               // first scale, directly above the accumulator bits
+
+  // ----------------------------------
+  // assign operands with src format
+  // ----------------------------------
+  // NaN-boxing check
+  for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : gen_nanbox
+
+    localparam int unsigned FP_WIDTH         = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned FP_WIDTH_DST_MIN = fpnew_pkg::minimum(DST_WIDTH, FP_WIDTH);
+
+    always_comb begin : nanbox
+      local_src_fmt_operand_d[fmt] = '1; // start from all ones so bits above the format width stay set
+      local_src_fmt_operand_d[fmt][FP_WIDTH_DST_MIN-1:0] = operands_i[2][FP_WIDTH_DST_MIN-1:0];
+      // Replicate the per-operand boxed flags per vector element: first all of operand 0, then all of operand 1
+      for (int i = 0; i < VectorSize; i++) begin
+        local_is_boxed[fmt][i] = is_boxed_i[fmt][0];
+        local_is_boxed[fmt][i+VectorSize] = is_boxed_i[fmt][1];
+      end
+      // NOTE(review): entries above index 2*VectorSize (if NUM_OPERANDS is larger) are not assigned here - confirm
+      local_is_boxed[fmt][2*VectorSize] = is_boxed_i[fmt][2];
+    end
+  end
+
+  fpnew_mxdotp_multi #(
+    .FpSrcFmtConfig     ( FpSrcFmtConfig  ),
+    .IntSrcFmtConfig    ( IntSrcFmtConfig ),
+    .FpDstFmtConfig     ( FpDstFmtConfig  ),
+    .NumPipeRegs        ( NumPipeRegs     ),
+    .PipeConfig         ( PipeConfig      ),
+    .TagType            ( TagType         ),
+    .AuxType            ( AuxType         )
+  ) i_fpnew_mxdotp_multi (
+    .clk_i,
+    .rst_ni,
+    .operands_a_i ( local_src_fmt_operand_a ),
+    .operands_b_i ( local_src_fmt_operand_b ),
+    .operands_a_fp6_rem_i ( local_src_fmt_operand_a_rem ),
+    .operands_b_fp6_rem_i ( local_src_fmt_operand_b_rem ),
+    .operands_c_i ( local_src_fmt_operand_c            ),
+    .operand_d_i  ( local_src_fmt_operand_d[dst_fmt_i] ), // accumulator prepared for the requested destination format
+    .is_boxed_i   ( local_is_boxed                     ),
+    .rnd_mode_i,
+    .op_i,
+    .op_mod_i,
+    .src_fmt_i, // format of the multiplicands
+    .int_fmt_i, // format of the multiplicands if they are integers
+    .dst_fmt_i, // format of the addend and result
+    .tag_i,
+    .mask_i,
+    .aux_i,
+    .in_valid_i,
+    .in_ready_o,
+    .flush_i,
+    .result_o     ( local_result[DST_WIDTH-1:0] ), // unit result occupies the low DST_WIDTH bits
+    .status_o,
+    .extension_bit_o,
+    .tag_o,
+    .mask_o,
+    .aux_o,
+    .out_valid_o,
+    .out_ready_i,
+    .busy_o
+  );
+
+  if (OPERAND_WIDTH > DST_WIDTH) begin // fill the bits above the destination width with ones
+    assign local_result[OPERAND_WIDTH-1:DST_WIDTH] = '1;
+  end
+  assign result_o = local_result;
+
+endmodule
diff --git a/src/fpnew_opgroup_block.sv b/src/fpnew_opgroup_block.sv
index db2c3032..f03119aa 100644
--- a/src/fpnew_opgroup_block.sv
+++ b/src/fpnew_opgroup_block.sv
@@ -21,6 +21,8 @@ module fpnew_opgroup_block #(
   parameter fpnew_pkg::divsqrt_unit_t   DivSqrtSel    = fpnew_pkg::THMULTI,
   parameter fpnew_pkg::fmt_logic_t      FpFmtMask     = '1,
   parameter fpnew_pkg::ifmt_logic_t     IntFmtMask    = '1,
+  parameter fpnew_pkg::fmt_logic_t      MxFpFmtMask   = '0,  // MX-specific FP formats
+  parameter fpnew_pkg::ifmt_logic_t     MxIntFmtMask  = '0,  // MX-specific INT formats
   parameter fpnew_pkg::fmt_unsigned_t   FmtPipeRegs   = '{default: 0},
   parameter fpnew_pkg::fmt_unit_types_t FmtUnitTypes  = '{default: fpnew_pkg::PARALLEL},
   parameter fpnew_pkg::pipe_config_t    PipeConfig    = fpnew_pkg::BEFORE,
@@ -178,15 +180,17 @@ module fpnew_opgroup_block #(
     assign in_valid = in_valid_i & (FmtUnitTypes[dst_fmt_i] == fpnew_pkg::MERGED);
 
     fpnew_opgroup_multifmt_slice #(
-      .OpGroup       ( OpGroup          ),
-      .Width         ( Width            ),
-      .FpFmtConfig   ( FpFmtMask        ),
-      .IntFmtConfig  ( IntFmtMask       ),
-      .EnableVectors ( EnableVectors    ),
-      .DivSqrtSel    ( DivSqrtSel       ),
-      .NumPipeRegs   ( REG              ),
-      .PipeConfig    ( PipeConfig       ),
-      .TagType       ( TagType          ),
+      .OpGroup        ( OpGroup          ),
+      .Width          ( Width            ),
+      .FpFmtConfig    ( FpFmtMask        ),
+      .IntFmtConfig   ( IntFmtMask       ),
+      .MxFpFmtConfig  ( MxFpFmtMask      ),
+      .MxIntFmtConfig ( MxIntFmtMask     ),
+      .EnableVectors  ( EnableVectors    ),
+      .DivSqrtSel     ( DivSqrtSel       ),
+      .NumPipeRegs    ( REG              ),
+      .PipeConfig     ( PipeConfig       ),
+      .TagType        ( TagType          ),
       .StochasticRndImplementation ( StochasticRndImplementation )
     ) i_multifmt_slice (
       .clk_i,
diff --git a/src/fpnew_opgroup_multifmt_slice.sv b/src/fpnew_opgroup_multifmt_slice.sv
index f5991cbd..5409b34a 100644
--- a/src/fpnew_opgroup_multifmt_slice.sv
+++ b/src/fpnew_opgroup_multifmt_slice.sv
@@ -16,16 +16,18 @@
 `include "common_cells/registers.svh"
 
 module fpnew_opgroup_multifmt_slice #(
-  parameter fpnew_pkg::opgroup_e      OpGroup       = fpnew_pkg::CONV,
-  parameter int unsigned              Width         = 64,
+  parameter fpnew_pkg::opgroup_e      OpGroup        = fpnew_pkg::CONV,
+  parameter int unsigned              Width          = 64,
   // FPU configuration
-  parameter fpnew_pkg::fmt_logic_t    FpFmtConfig   = '1,
-  parameter fpnew_pkg::ifmt_logic_t   IntFmtConfig  = '1,
-  parameter logic                     EnableVectors = 1'b1,
-  parameter fpnew_pkg::divsqrt_unit_t DivSqrtSel    = fpnew_pkg::THMULTI,
-  parameter int unsigned              NumPipeRegs   = 0,
-  parameter fpnew_pkg::pipe_config_t  PipeConfig    = fpnew_pkg::BEFORE,
-  parameter type                      TagType       = logic,
+  parameter fpnew_pkg::fmt_logic_t    FpFmtConfig    = '1,
+  parameter fpnew_pkg::ifmt_logic_t   IntFmtConfig   = '1,
+  parameter fpnew_pkg::fmt_logic_t    MxFpFmtConfig  = '0,  // MX-specific FP formats
+  parameter fpnew_pkg::ifmt_logic_t   MxIntFmtConfig = '0,  // MX-specific INT formats
+  parameter logic                     EnableVectors  = 1'b1,
+  parameter fpnew_pkg::divsqrt_unit_t DivSqrtSel     = fpnew_pkg::THMULTI,
+  parameter int unsigned              NumPipeRegs    = 0,
+  parameter fpnew_pkg::pipe_config_t  PipeConfig     = fpnew_pkg::BEFORE,
+  parameter type                      TagType        = logic,
   parameter fpnew_pkg::rsr_impl_t     StochasticRndImplementation = fpnew_pkg::DEFAULT_NO_RSR,
   // Do not change
   localparam int unsigned NUM_OPERANDS = fpnew_pkg::num_operands(OpGroup),
@@ -82,11 +84,22 @@ The SDOTP operations compute on 8b inputs producing 16b outputs \
 or on 16b inputs producing 32b outputs");
   end
 
+  if (OpGroup == fpnew_pkg::MXDOTP) begin
+    if (Width != 64) begin
+      $fatal(1, "MXDOTP only supported on 64b CVFPU instances, got Width=%0d", Width);
+    end else if (!FpFmtConfig[fpnew_pkg::FP32]) begin
+      $fatal(1, "MXDOTP requires FP32 to be enabled as a destination format. Please enable FP32 in FpFmtConfig");
+    end else if (!MxFpFmtConfig[fpnew_pkg::FP8]) begin
+      $fatal(1, "MXDOTP requires FP8 to be enabled as a source format. Please enable FP8 in MxFpFmtConfig.");
+    end
+  end
+
   localparam int unsigned MAX_FP_WIDTH   = fpnew_pkg::max_fp_width(FpFmtConfig);
   localparam int unsigned MAX_INT_WIDTH  = fpnew_pkg::max_int_width(IntFmtConfig);
   localparam int unsigned NUM_LANES = fpnew_pkg::max_num_lanes(Width, FpFmtConfig, 1'b1);
   localparam int unsigned NUM_DIVSQRT_LANES = fpnew_pkg::num_divsqrt_lanes(Width, FpFmtConfig, 1'b1, DivSqrtSel);
   localparam int unsigned NUM_DOTP_LANES = fpnew_pkg::num_dotp_lanes(Width, FpFmtConfig);
+  localparam int unsigned NUM_MX_LANES = fpnew_pkg::num_mxdotp_lanes(Width, MxFpFmtConfig, MxIntFmtConfig);
   localparam int unsigned NUM_INT_FORMATS = fpnew_pkg::NUM_INT_FORMATS;
   // We will send the format information along with the data
   localparam int unsigned FMT_BITS =
@@ -195,6 +208,16 @@ or on 16b inputs producing 32b outputs");
     localparam int unsigned DOTP_MAX_FMT_WIDTH = fpnew_pkg::max_fp_width(DOTP_FORMATS);
     localparam int unsigned DOTP_WIDTH = fpnew_pkg::minimum(2*DOTP_MAX_FMT_WIDTH, Width);
 
+    // MXDOTP-specific parameters
+    localparam fpnew_pkg::lane_formats_t MXDOTP_FORMATS =
+        fpnew_pkg::get_mxdotp_formats(Width, FpFmtConfig, MxFpFmtConfig, MxIntFmtConfig, LANE);
+    localparam fpnew_pkg::fmt_logic_t MXDOTP_FP_FORMATS =
+        MXDOTP_FORMATS.src_fp_formats;
+    localparam fpnew_pkg::ifmt_logic_t MXDOTP_INT_FORMATS =
+        MXDOTP_FORMATS.src_int_formats;
+    localparam fpnew_pkg::fmt_logic_t MXDOTP_DST_FORMATS =
+        MXDOTP_FORMATS.dst_fp_formats;
+
     // Lane parameters from Opgroup
     localparam fpnew_pkg::fmt_logic_t LANE_FORMATS = (OpGroup == fpnew_pkg::CONV) ? CONV_FORMATS :
                                                      (OpGroup == fpnew_pkg::DOTP) ? DOTP_FORMATS :
@@ -206,7 +229,9 @@ or on 16b inputs producing 32b outputs");
 
     // Generate instances only if needed, lane 0 always generated
     if ((lane == 0) || (EnableVectors & (!(OpGroup == fpnew_pkg::DOTP && (lane >= NUM_DOTP_LANES))
-                                        && !(OpGroup == fpnew_pkg::DIVSQRT && (lane >= NUM_DIVSQRT_LANES))))) begin : active_lane
+                                        && !(OpGroup == fpnew_pkg::DIVSQRT && (lane >= NUM_DIVSQRT_LANES))
+                                        && !(OpGroup == fpnew_pkg::MXDOTP && (lane >= NUM_MX_LANES))
+                                        ))) begin : active_lane
       logic in_valid, out_valid, out_ready; // lane-local handshake
 
       logic [NUM_OPERANDS-1:0][LANE_WIDTH-1:0] local_operands;  // lane-local oprands
@@ -215,7 +240,8 @@ or on 16b inputs producing 32b outputs");
 
       logic lane_is_used;
       assign lane_is_used = (LANE_FORMATS[src_fmt_i] & ~is_up_cast) |
-                            (LANE_FORMATS[dst_fmt_i] &  is_up_cast) | (OpGroup == fpnew_pkg::DIVSQRT);
+                            (LANE_FORMATS[dst_fmt_i] &  is_up_cast) | 
+                            (OpGroup == fpnew_pkg::DIVSQRT) | (OpGroup == fpnew_pkg::MXDOTP);
       assign in_valid = in_valid_i & ((lane == 0) | vectorial_op) & lane_is_used; // upper lanes only for vectors
 
       // Slice out the operands for this lane, upper bits are ignored in the unit
@@ -462,6 +488,42 @@ or on 16b inputs producing 32b outputs");
           .out_ready_i     ( out_ready           ),
           .busy_o          ( lane_busy[lane]     )
         );
+      end else if (OpGroup == fpnew_pkg::MXDOTP) begin : lane_instance
+        fpnew_mxdotp_multi_wrapper #(
+          .FpSrcFmtConfig  ( MXDOTP_FP_FORMATS    ),
+          .IntSrcFmtConfig ( MXDOTP_INT_FORMATS   ),
+          .FpDstFmtConfig  ( MXDOTP_DST_FORMATS   ),
+          .NumPipeRegs     ( NumPipeRegs          ),
+          .PipeConfig      ( PipeConfig           ),
+          .TagType         ( TagType              ),
+          .AuxType         ( logic [AUX_BITS-1:0] )
+        ) i_fpnew_mxdotp_multi_wrapper (
+          .clk_i,
+          .rst_ni,
+          .operands_i      ( local_operands[2:0]  ),
+          .is_boxed_i,
+          .rnd_mode_i,
+          .op_i,
+          .op_mod_i,
+          .src_fmt_i,
+          .int_fmt_i,
+          .dst_fmt_i,
+          .tag_i,
+          .mask_i          ( simd_mask_i[lane]   ),
+          .aux_i           ( aux_data            ),
+          .in_valid_i      ( in_valid            ),
+          .in_ready_o      ( lane_in_ready[lane] ),
+          .flush_i,
+          .result_o        ( op_result           ),
+          .status_o        ( op_status           ),
+          .extension_bit_o ( lane_ext_bit[lane]  ),
+          .tag_o           ( lane_tags[lane]     ),
+          .mask_o          ( lane_masks[lane]    ),
+          .aux_o           ( lane_aux[lane]      ),
+          .out_valid_o     ( out_valid           ),
+          .out_ready_i     ( out_ready           ),
+          .busy_o          ( lane_busy[lane]     )
+        );
       end // ADD OTHER OPTIONS HERE
 
       // Handshakes are only done if the lane is actually used
diff --git a/src/fpnew_pkg.sv b/src/fpnew_pkg.sv
index 1e8ce099..637b2e6e 100644
--- a/src/fpnew_pkg.sv
+++ b/src/fpnew_pkg.sv
@@ -26,6 +26,9 @@ package fpnew_pkg;
   // | FP8        | binary8          |  8 bit | 5        | 2
   // | FP16ALT    | binary16alt      | 16 bit | 8        | 7
   // | FP8ALT     | binary8alt       |  8 bit | 4        | 3
+  // | FP6        | binary6          |  6 bit | 3        | 2
+  // | FP6ALT     | binary6alt       |  6 bit | 2        | 3
+  // | FP4        | binary4          |  4 bit | 2        | 1
   // *NOTE:* Add new formats only at the end of the enumeration for backwards compatibilty!
 
   // Encoding for a format
@@ -34,7 +37,7 @@ package fpnew_pkg;
     int unsigned man_bits;
   } fp_encoding_t;
 
-  localparam int unsigned NUM_FP_FORMATS = 6; // change me to add formats
+  localparam int unsigned NUM_FP_FORMATS = 9; // change me to add formats
   localparam int unsigned FP_FORMAT_BITS = $clog2(NUM_FP_FORMATS);
 
   // FP formats
@@ -44,7 +47,10 @@ package fpnew_pkg;
     FP16    = 'd2,
     FP8     = 'd3,
     FP16ALT = 'd4,
-    FP8ALT  = 'd5
+    FP8ALT  = 'd5,
+    FP6     = 'd6,
+    FP6ALT  = 'd7,
+    FP4     = 'd8
     // add new formats here
   } fp_format_e;
 
@@ -55,17 +61,20 @@ package fpnew_pkg;
     '{5,  10}, // IEEE binary16 (half)
     '{5,  2},  // custom binary8
     '{8,  7},  // custom binary16alt
-    '{4,  3}   // custom binary8alt
+    '{4,  3},  // custom binary8alt
+    '{3,  2},  // custom binary6
+    '{2,  3},  // custom binary6alt
+    '{2,  1}   // custom binary4
     // add new formats here
   };
 
   typedef logic [0:NUM_FP_FORMATS-1]       fmt_logic_t;    // Logic indexed by FP format (for masks)
   typedef logic [0:NUM_FP_FORMATS-1][31:0] fmt_unsigned_t; // Unsigned indexed by FP format
 
-  localparam fmt_logic_t CPK_FORMATS  = 6'b110000; // FP32 and FP64 can provide CPK only
+  localparam fmt_logic_t CPK_FORMATS  = 9'b110000000; // FP32 and FP64 can provide CPK only
   // FP32, FP64 cannot be provided for DOTP
   // Small hack: FP32 only enabled for wide enough wrapper input widths for vsum.s instruction
-  localparam fmt_logic_t DOTP_FORMATS = 6'b101111;
+  localparam fmt_logic_t DOTP_FORMATS = 9'b101111000;
 
   // ---------
   // INT TYPES
@@ -110,14 +119,28 @@ package fpnew_pkg;
 
   typedef logic [0:NUM_INT_FORMATS-1] ifmt_logic_t; // Logic indexed by INT format (for masks)
 
+  // Combined format struct for operations that need FP, INT, and destination formats
+  typedef struct packed {
+    fmt_logic_t  src_fp_formats;  // mask of FP source formats
+    ifmt_logic_t src_int_formats; // mask of INT source formats
+    fmt_logic_t  dst_fp_formats;  // mask of FP destination formats
+  } lane_formats_t;
+
+  // MXDOTP format masks (fmt_logic_t/ifmt_logic_t are [0:N-1]: leftmost literal bit = index 0)
+  localparam lane_formats_t MXDOTP_FORMATS_MASK = '{
+    src_fp_formats:  9'b000101111,  // FP8, FP8ALT, FP6, FP6ALT, FP4
+    src_int_formats: 4'b1000,       // INT8
+    dst_fp_formats:  9'b100010000   // FP32, FP16ALT
+  };
+
   // --------------
   // FP OPERATIONS
   // --------------
-  localparam int unsigned NUM_OPGROUPS = 5;
+  localparam int unsigned NUM_OPGROUPS = 6;
 
   // Each FP operation belongs to an operation group
   typedef enum logic [2:0] {
-    ADDMUL, DIVSQRT, NONCOMP, CONV, DOTP
+    ADDMUL, DIVSQRT, NONCOMP, CONV, DOTP, MXDOTP
   } opgroup_e;
 
   localparam int unsigned OP_BITS = 5;
@@ -127,7 +150,8 @@ package fpnew_pkg;
     DIV, SQRT,                   // DIVSQRT operation group
     SGNJ, MINMAX, CMP, CLASSIFY, // NONCOMP operation group
     F2F, F2I, I2F, CPKAB, CPKCD, // CONV operation group
-    SDOTP, EXVSUM, VSUM          // DOTP operation group
+    SDOTP, EXVSUM, VSUM,         // DOTP operation group
+    MXDOTPF, MXDOTPI             // MXDOTP operation group
   } operation_e;
 
   // -------------
@@ -226,56 +250,70 @@ package fpnew_pkg;
     int unsigned Width;
     logic        EnableVectors;
     logic        EnableNanBox;
-    fmt_logic_t  FpFmtMask;
-    ifmt_logic_t IntFmtMask;
+    fmt_logic_t  FpFmtMask;    // Standard FP formats for all opgroups
+    ifmt_logic_t IntFmtMask;   // Standard INT formats for all opgroups
+    fmt_logic_t  MxFpFmtMask;  // MX-specific FP formats (FP6, FP6ALT, FP4, plus FP8/FP8ALT)
+    ifmt_logic_t MxIntFmtMask; // MX-specific INT formats (INT8)
   } fpu_features_t;
 
   localparam fpu_features_t RV64D = '{
     Width:         64,
     EnableVectors: 1'b0,
     EnableNanBox:  1'b1,
-    FpFmtMask:     6'b110000,
-    IntFmtMask:    4'b0011
+    FpFmtMask:     9'b110000000,
+    IntFmtMask:    4'b0011,
+    MxFpFmtMask:   9'b0,         // No MX support
+    MxIntFmtMask:  4'b0
   };
 
   localparam fpu_features_t RV32D = '{
     Width:         64,
     EnableVectors: 1'b1,
     EnableNanBox:  1'b1,
-    FpFmtMask:     6'b110000,
-    IntFmtMask:    4'b0010
+    FpFmtMask:     9'b110000000,
+    IntFmtMask:    4'b0010,
+    MxFpFmtMask:   9'b0,         // No MX support
+    MxIntFmtMask:  4'b0
   };
 
   localparam fpu_features_t RV32F = '{
     Width:         32,
     EnableVectors: 1'b0,
     EnableNanBox:  1'b1,
-    FpFmtMask:     6'b100000,
-    IntFmtMask:    4'b0010
+    FpFmtMask:     9'b100000000,
+    IntFmtMask:    4'b0010,
+    MxFpFmtMask:   9'b0,         // No MX support
+    MxIntFmtMask:  4'b0
   };
 
   localparam fpu_features_t RV64D_Xsflt = '{
     Width:         64,
     EnableVectors: 1'b1,
     EnableNanBox:  1'b1,
-    FpFmtMask:     6'b111111,
-    IntFmtMask:    4'b1111
+    FpFmtMask:     9'b111111000,  // Standard formats (not including FP6, FP6ALT, FP4)
+    IntFmtMask:    4'b1111,
+    MxFpFmtMask:   9'b000101111,  // MX formats: FP8, FP8ALT, FP6, FP6ALT, FP4
+    MxIntFmtMask:  4'b1000        // INT8 for MX operations
   };
 
   localparam fpu_features_t RV32F_Xsflt = '{
     Width:         32,
     EnableVectors: 1'b1,
     EnableNanBox:  1'b1,
-    FpFmtMask:     6'b101111,
-    IntFmtMask:    4'b1110
+    FpFmtMask:     9'b101111000,
+    IntFmtMask:    4'b1110,
+    MxFpFmtMask:   9'b0,         // No MX support (32-bit width insufficient)
+    MxIntFmtMask:  4'b0
   };
 
   localparam fpu_features_t RV32F_Xf16alt_Xfvec = '{
     Width:         32,
     EnableVectors: 1'b1,
     EnableNanBox:  1'b1,
-    FpFmtMask:     6'b100010,
-    IntFmtMask:    4'b0110
+    FpFmtMask:     9'b100010000,
+    IntFmtMask:    4'b0110,
+    MxFpFmtMask:   9'b0,         // No MX support
+    MxIntFmtMask:  4'b0
   };
 
 
@@ -292,7 +330,8 @@ package fpnew_pkg;
                   '{default: MERGED},   // DIVSQRT
                   '{default: PARALLEL}, // NONCOMP
                   '{default: MERGED},   // CONV
-                  '{default: DISABLED}},  // DOTP
+                  '{default: DISABLED},  // DOTP
+                  '{default: DISABLED}}, // MXDOTP
     PipeConfig: BEFORE
   };
 
@@ -302,7 +341,8 @@ package fpnew_pkg;
                   '{default: DISABLED}, // DIVSQRT
                   '{default: PARALLEL}, // NONCOMP
                   '{default: MERGED},   // CONV
-                  '{default: MERGED}},  // DOTP
+                  '{default: MERGED},   // DOTP
+                  '{default: MERGED}},  // MXDOTP
     PipeConfig: BEFORE
   };
 
@@ -425,6 +465,7 @@ package fpnew_pkg;
       SGNJ, MINMAX, CMP, CLASSIFY: return NONCOMP;
       F2F, F2I, I2F, CPKAB, CPKCD: return CONV;
       SDOTP, EXVSUM, VSUM:         return DOTP;
+      MXDOTPF, MXDOTPI:            return MXDOTP;
       default:                     return NONCOMP;
     endcase
   endfunction
@@ -437,6 +478,7 @@ package fpnew_pkg;
       NONCOMP: return 2;
       CONV:    return 3; // vectorial casts use 3 operands
       DOTP:    return 3; // splitting into 5 operands done in wrapper
+      MXDOTP:  return 3; // splitting into 4 operands done in wrapper
       default: return 0;
     endcase
   endfunction
@@ -454,7 +496,7 @@ package fpnew_pkg;
     // Returns the maximum number of lanes in the FPU according to width, format config and vectors
   function automatic int unsigned num_divsqrt_lanes(int unsigned width, fmt_logic_t cfg, logic vec, divsqrt_unit_t DivSqrtSel);
     automatic fmt_logic_t cfg_tmp;
-    cfg_tmp = (DivSqrtSel == THMULTI) ? cfg & 6'b111010 : cfg;
+    cfg_tmp = (DivSqrtSel == THMULTI) ? cfg & 9'b111010000 : cfg;
     return vec ? width / min_fp_width(cfg_tmp) : 1; // if no vectors, only one lane
   endfunction
 
@@ -514,13 +556,43 @@ package fpnew_pkg;
     automatic fmt_logic_t mask;
     int unsigned nr_16to32bit_lanes = (cfg[FP32]) ? (width / 32) : 0;
     if (lane_no < nr_16to32bit_lanes)
-      mask = 6'b101111;  //lane should be 16-bit -> 32-bit
+      mask = 9'b101111000;  //lane should be 16-bit -> 32-bit
     else
-      mask = 6'b001111;  //lane should be  8-bit -> 16-bit
+      mask = 9'b001111000;  //lane should be  8-bit -> 16-bit
     res = cfg & mask;
     return res;
   endfunction
 
+  // Returns how many MXDOTP lanes should be generated (unsigned, like the other lane counts)
+  function automatic int unsigned num_mxdotp_lanes(int unsigned width,
+                                                   fmt_logic_t mx_fp_cfg,
+                                                   ifmt_logic_t mx_int_cfg);
+    // MXDOTP is single-lane, non-vectorial: the result is 1 only when width == 64 and at least
+    // one MX source format (FP8, FP8ALT, FP6, FP6ALT, FP4 or INT8) is enabled, otherwise 0
+    return (width == 64 && (|(mx_fp_cfg & MXDOTP_FORMATS_MASK.src_fp_formats) ||
+                            |(mx_int_cfg & MXDOTP_FORMATS_MASK.src_int_formats))) ? 1 : 0;
+  endfunction
+
+  // Builds the MXDOTP format masks of a lane: FP sources, INT sources and FP destinations,
+  // each restricted to the formats listed in MXDOTP_FORMATS_MASK.
+  function automatic lane_formats_t get_mxdotp_formats(int unsigned width,
+                                                       fmt_logic_t fp_cfg,
+                                                       fmt_logic_t mx_fp_cfg,
+                                                       ifmt_logic_t mx_int_cfg,
+                                                       int unsigned lane_no);
+    // NOTE: `width` and `lane_no` are not read below; width == 64 is assumed
+    // (validated at instantiation).
+    automatic lane_formats_t lane_fmts;
+
+    // Sources are taken from the MX config: FP8, FP8ALT, FP6, FP6ALT, FP4 and INT8
+    lane_fmts.src_fp_formats  = MXDOTP_FORMATS_MASK.src_fp_formats  & mx_fp_cfg;
+    lane_fmts.src_int_formats = MXDOTP_FORMATS_MASK.src_int_formats & mx_int_cfg;
+
+    // Destinations are taken from the standard FP config: FP32 and FP16ALT
+    lane_fmts.dst_fp_formats  = MXDOTP_FORMATS_MASK.dst_fp_formats  & fp_cfg;
+    return lane_fmts;
+  endfunction
+
   // Returns the dotp dest FP format string
   function automatic fmt_logic_t get_dotp_dst_fmts(fmt_logic_t cfg, fmt_logic_t src_cfg);
     automatic fmt_logic_t res;
@@ -529,7 +601,10 @@ package fpnew_pkg;
             cfg[FP16] && (src_cfg[FP8] || src_cfg[FP8ALT]),
             cfg[FP8],                                           // FP8 supported as dstFmt for VSUM
             cfg[FP16ALT] && (src_cfg[FP8] || src_cfg[FP8ALT]),
-            cfg[FP8ALT]                                         // FP8ALT supported as dstFmt for VSUM
+            cfg[FP8ALT],                                        // FP8ALT supported as dstFmt for VSUM
+            1'b0,                                               // FP6 not supported as dstFmt
+            1'b0,                                               // FP6ALT not supported as dstFmt
+            1'b0                                                // FP4 not supported as dstFmt
     };
     return res;
   endfunction
diff --git a/src/fpnew_sdotp_multi_wrapper.sv b/src/fpnew_sdotp_multi_wrapper.sv
index d402b67a..edbbea9e 100644
--- a/src/fpnew_sdotp_multi_wrapper.sv
+++ b/src/fpnew_sdotp_multi_wrapper.sv
@@ -26,7 +26,7 @@ module fpnew_sdotp_multi_wrapper #(
   parameter type                     AuxType     = logic,
   parameter fpnew_pkg::rsr_impl_t    StochasticRndImplementation = fpnew_pkg::DEFAULT_NO_RSR,
   // Do not change
-  localparam fpnew_pkg::fmt_logic_t FpSrcFmtConfig = FpFmtConfig[0] ? (FpFmtConfig & 6'b001111) : (FpFmtConfig & 6'b000101),
+  localparam fpnew_pkg::fmt_logic_t FpSrcFmtConfig = FpFmtConfig[0] ? (FpFmtConfig & 9'b001111000) : (FpFmtConfig & 9'b000101000),
   localparam fpnew_pkg::fmt_logic_t FpDstFmtConfig = fpnew_pkg::get_dotp_dst_fmts(FpFmtConfig, FpSrcFmtConfig),
   localparam int                    SRC_WIDTH      = fpnew_pkg::maximum(fpnew_pkg::max_fp_width(FpSrcFmtConfig), 1),
   localparam int                    DST_WIDTH      = fpnew_pkg::maximum(2*fpnew_pkg::max_fp_width(FpSrcFmtConfig), 1), // do not change, current assumption of sdotpex_multi
diff --git a/src/fpnew_top.sv b/src/fpnew_top.sv
index b564286d..a483df72 100644
--- a/src/fpnew_top.sv
+++ b/src/fpnew_top.sv
@@ -125,6 +125,8 @@ module fpnew_top #(
       .DivSqrtSel    ( DivSqrtSel                      ),
       .FpFmtMask     ( Features.FpFmtMask              ),
       .IntFmtMask    ( Features.IntFmtMask             ),
+      .MxFpFmtMask   ( Features.MxFpFmtMask            ),
+      .MxIntFmtMask  ( Features.MxIntFmtMask           ),
       .FmtPipeRegs   ( Implementation.PipeRegs[opgrp]  ),
       .FmtUnitTypes  ( Implementation.UnitTypes[opgrp] ),
       .PipeConfig    ( Implementation.PipeConfig       ),
diff --git a/src/mxdotp/fpnew_mxdotp_multi_modules.sv b/src/mxdotp/fpnew_mxdotp_multi_modules.sv
new file mode 100644
index 00000000..5b558e2b
--- /dev/null
+++ b/src/mxdotp/fpnew_mxdotp_multi_modules.sv
@@ -0,0 +1,987 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+//
+// Copyright and related rights are licensed under the Solderpad Hardware
+// License, Version 0.51 (the "License"); you may not use this file except in
+// compliance with the License. You may obtain a copy of the License at
+// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+// or agreed to in writing, software, hardware and materials distributed under
+// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
+
+// Author: Gamze Islamoglu <gislamoglu@iis.ee.ethz.ch>
+
+// Classifies and unpacks input operands (FP8/FP6/FP4 vectors, scales, accumulator) into sign/exponent/mantissa
+// fields and fp_info structs. Converts unsigned scales (0-255) to signed offsets (-127 to +128).
+module fpnew_mxdotp_classifier
+  import fpnew_mxdotp_multi_pkg::*;
+#(
+  parameter fpnew_pkg::fmt_logic_t FpSrcFmtConfig = MxdotpSrcFpFmtConfig,
+  parameter fpnew_pkg::fmt_logic_t FpDstFmtConfig = MxdotpDstFpFmtConfig,
+  parameter int unsigned           FP6VectorSize  = 3,
+  parameter int unsigned           FP4VectorSize  = 5,
+  parameter int unsigned           NumInpRegs     = 0
+) (
+  // Input signals
+  input logic [2*VectorSize-1:0][SRC_WIDTH-1:0] operands_post_inp_pipe,
+  input logic [2*FP6VectorSize-1:0][SRC_WIDTH-1:0] fp6_operands_post_inp_pipe,
+  input logic [2*FP4VectorSize-1:0][SRC_WIDTH-1:0] fp4_operands_post_inp_pipe,
+  input logic signed [1:0][SCALE_WIDTH-1:0] operands_c_q,
+  input logic [DST_WIDTH-1:0] operand_d_q,
+  input logic [0:NumInpRegs][NUM_FORMATS-1:0][NUM_OPERANDS-1:0] inp_pipe_is_boxed_q,
+  input fpnew_pkg::fp_format_e src_fmt_q,
+  input logic src_is_int,
+  input fpnew_pkg::fp_format_e dst_fmt_q,
+  input logic [0:NumInpRegs] inp_pipe_op_mod_q,
+  // Output signals
+  output fpnew_pkg::fp_info_t [VectorSize-1:0] info_a,
+  output fpnew_pkg::fp_info_t [FP6VectorSize-1:0] fp6_info_a,
+  output fpnew_pkg::fp_info_t [FP4VectorSize-1:0] fp4_info_a,
+  output fpnew_pkg::fp_info_t [VectorSize-1:0] info_b,
+  output fpnew_pkg::fp_info_t [FP6VectorSize-1:0] fp6_info_b,
+  output fpnew_pkg::fp_info_t [FP4VectorSize-1:0] fp4_info_b,
+  output fpnew_pkg::fp_info_t [1:0] info_c,
+  output fpnew_pkg::fp_info_t info_d,
+  output fp_src_t [VectorSize-1:0] operands_a,
+  output fp6_src_t [FP6VectorSize-1:0] fp6_operands_a,
+  output fp4_src_t [FP4VectorSize-1:0] fp4_operands_a,
+  output fp_src_t [VectorSize-1:0] operands_b,
+  output fp6_src_t [FP6VectorSize-1:0] fp6_operands_b,
+  output fp4_src_t [FP4VectorSize-1:0] fp4_operands_b,
+  output logic signed [1:0][SCALE_WIDTH-1:0] operands_c,
+  output fp_dst_t operand_d
+);
+
+  // -----------------
+  // Source operands
+  // -----------------
+  logic        [NUM_FORMATS-1:0][2*VectorSize-1:0]                     fmt_sign;
+  logic signed [NUM_FORMATS-1:0][2*VectorSize-1:0][SUPER_EXP_BITS-1:0] fmt_exponent;
+  logic        [NUM_FORMATS-1:0][2*VectorSize-1:0][SUPER_MAN_BITS-1:0] fmt_mantissa;
+
+  fpnew_pkg::fp_info_t [NUM_FORMATS-1:0][NUM_OPERANDS-1:0] info_q;
+
+  // FP6
+  logic        [NUM_FORMATS-1:0][2*FP6VectorSize-1:0]                   fp6_fmt_sign;
+  logic signed [NUM_FORMATS-1:0][2*FP6VectorSize-1:0][FP6_EXP_BITS-1:0] fp6_fmt_exponent;
+  logic        [NUM_FORMATS-1:0][2*FP6VectorSize-1:0][FP6_MAN_BITS-1:0] fp6_fmt_mantissa;
+
+  fpnew_pkg::fp_info_t [NUM_FORMATS-1:0][2*FP6VectorSize-1:0] fp6_info_q;
+
+  // FP4
+  logic        [NUM_FORMATS-1:0][2*FP4VectorSize-1:0]                   fp4_fmt_sign;
+  logic signed [NUM_FORMATS-1:0][2*FP4VectorSize-1:0][FP4_EXP_BITS-1:0] fp4_fmt_exponent;
+  logic        [NUM_FORMATS-1:0][2*FP4VectorSize-1:0][FP4_MAN_BITS-1:0] fp4_fmt_mantissa;
+
+  fpnew_pkg::fp_info_t [NUM_FORMATS-1:0][2*FP4VectorSize-1:0] fp4_info_q;
+
+  // FP Input initialization (Src)
+  for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : fmt_src_init_inputs
+    // Set up some constants
+    localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
+
+    if (FpSrcFmtConfig[fmt]) begin : active_src_format
+      logic [2*VectorSize-1:0][FP_WIDTH-1:0] trimmed_ops;
+
+      // Classify input
+      fpnew_classifier #(
+        .FpFormat    ( fpnew_pkg::fp_format_e'(fmt) ),
+        .NumOperands ( 2*VectorSize                 ),
+        .MX          ( 1                            )
+      ) i_fpnew_classifier (
+        .operands_i  ( trimmed_ops                                            ),
+        .is_boxed_i  ( inp_pipe_is_boxed_q[NumInpRegs][fmt][2*VectorSize-1:0] ),
+        .info_o      ( info_q[fmt][2*VectorSize-1:0]                          )
+      );
+      for (genvar op = 0; op < 2*VectorSize; op++) begin : gen_operands
+        assign trimmed_ops[op]       = operands_post_inp_pipe[op][FP_WIDTH-1:0];
+        assign fmt_sign[fmt][op]     = operands_post_inp_pipe[op][FP_WIDTH-1];
+        assign fmt_exponent[fmt][op] = signed'({1'b0, operands_post_inp_pipe[op][MAN_BITS+:EXP_BITS]});
+        assign fmt_mantissa[fmt][op] = operands_post_inp_pipe[op][MAN_BITS-1:0] <<
+                                       (SUPER_MAN_BITS - MAN_BITS); // move to left of mantissa
+      end
+    end else begin : inactive_src_format
+      assign info_q[fmt][2*VectorSize-1:0]  = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+      assign fmt_sign[fmt]                  = fpnew_pkg::DONT_CARE;             // format disabled
+      assign fmt_exponent[fmt]              = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+      assign fmt_mantissa[fmt]              = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+    end
+  end
+
+  if (FP6VectorSize != 0) begin : fp6_classifier
+    for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : fp6_fmt_src_init_inputs
+      // Set up some constants
+      localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+      localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
+      localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
+
+      if (FpSrcFmtConfig[fmt]) begin : active_src_format
+        logic [2*FP6VectorSize-1:0][FP_WIDTH-1:0] trimmed_ops;
+
+        // Classify input
+        fpnew_classifier #(
+          .FpFormat    ( fpnew_pkg::fp_format_e'(fmt) ),
+          .NumOperands ( 2*FP6VectorSize              ),
+          .MX          ( 1                            )
+        ) i_fpnew_classifier (
+          .operands_i  ( trimmed_ops                                               ),
+          .is_boxed_i  ( inp_pipe_is_boxed_q[NumInpRegs][fmt][2*FP6VectorSize-1:0] ),
+          .info_o      ( fp6_info_q[fmt][2*FP6VectorSize-1:0]                      )
+        );
+        for (genvar op = 0; op < 2*FP6VectorSize; op++) begin : gen_operands
+          assign trimmed_ops[op]           = fp6_operands_post_inp_pipe[op][FP_WIDTH-1:0];
+          assign fp6_fmt_sign[fmt][op]     = fp6_operands_post_inp_pipe[op][FP_WIDTH-1];
+          assign fp6_fmt_exponent[fmt][op] = fp6_operands_post_inp_pipe[op][MAN_BITS+:EXP_BITS];
+          assign fp6_fmt_mantissa[fmt][op] = fp6_operands_post_inp_pipe[op][MAN_BITS-1:0] <<
+                                        (SUPER_MAN_BITS - MAN_BITS); // move to left of mantissa
+        end
+      end else begin : inactive_src_format
+        assign fp6_info_q[fmt][2*FP6VectorSize-1:0] = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+        assign fp6_fmt_sign[fmt]                    = fpnew_pkg::DONT_CARE;             // format disabled
+        assign fp6_fmt_exponent[fmt]                = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+        assign fp6_fmt_mantissa[fmt]                = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+      end
+    end
+  end
+
+  if (FP4VectorSize != 0) begin : fp4_classifier
+    for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : fp4_fmt_src_init_inputs
+      // Set up some constants
+      localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+      localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
+      localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
+
+      if (FpSrcFmtConfig[fmt]) begin : active_src_format
+        logic [2*FP4VectorSize-1:0][FP_WIDTH-1:0] trimmed_ops;
+
+        // Classify input
+        fpnew_classifier #(
+          .FpFormat    ( fpnew_pkg::fp_format_e'(fmt) ),
+          .NumOperands ( 2*FP4VectorSize              ),
+          .MX          ( 1                            )
+        ) i_fpnew_classifier (
+          .operands_i  ( trimmed_ops                                               ),
+          .is_boxed_i  ( inp_pipe_is_boxed_q[NumInpRegs][fmt][2*FP4VectorSize-1:0] ),
+          .info_o      ( fp4_info_q[fmt][2*FP4VectorSize-1:0]                           )
+        );
+        for (genvar op = 0; op < 2*FP4VectorSize; op++) begin : gen_operands
+          assign trimmed_ops[op]           = fp4_operands_post_inp_pipe[op][FP_WIDTH-1:0];
+          assign fp4_fmt_sign[fmt][op]     = fp4_operands_post_inp_pipe[op][FP_WIDTH-1];
+          assign fp4_fmt_exponent[fmt][op] = fp4_operands_post_inp_pipe[op][MAN_BITS+:EXP_BITS];
+          assign fp4_fmt_mantissa[fmt][op] = fp4_operands_post_inp_pipe[op][MAN_BITS-1:0];
+        end
+      end else begin : inactive_src_format
+        assign fp4_info_q[fmt][2*FP4VectorSize-1:0] = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+        assign fp4_fmt_sign[fmt]                    = fpnew_pkg::DONT_CARE;             // format disabled
+        assign fp4_fmt_exponent[fmt]                = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+        assign fp4_fmt_mantissa[fmt]                = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+      end
+    end
+  end
+
+  // ----------------------------
+  // Destination operand
+  // ----------------------------
+  logic        [NUM_FORMATS-1:0]                         fmt_dst_sign;
+  logic signed [NUM_FORMATS-1:0][SUPER_DST_EXP_BITS-1:0] fmt_dst_exponent;
+  logic        [NUM_FORMATS-1:0][SUPER_DST_MAN_BITS-1:0] fmt_dst_mantissa;
+
+  // FP Input initialization (Dst)
+  for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : fmt_dst_init_inputs
+    // Set up some constants
+    localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
+
+    if (FpDstFmtConfig[fmt]) begin : active_dst_format
+      logic [FP_WIDTH-1:0] trimmed_dst_ops;
+      logic                dst_ops_is_boxed;
+
+      assign dst_ops_is_boxed = inp_pipe_is_boxed_q[NumInpRegs][fmt][NUM_OPERANDS-1];
+
+      // Classify input
+      fpnew_classifier #(
+        .FpFormat    ( fpnew_pkg::fp_format_e'(fmt) ),
+        .NumOperands ( 1                            )
+      ) i_fpnew_classifier (
+        .operands_i  ( trimmed_dst_ops             ),
+        .is_boxed_i  ( dst_ops_is_boxed            ),
+        .info_o      ( info_q[fmt][NUM_OPERANDS-1] )
+      );
+      assign trimmed_dst_ops       = operand_d_q[FP_WIDTH-1:0];
+      assign fmt_dst_sign[fmt]     = operand_d_q[FP_WIDTH-1];
+      assign fmt_dst_exponent[fmt] = signed'({1'b0, operand_d_q[MAN_BITS+:EXP_BITS]});
+      assign fmt_dst_mantissa[fmt] = {info_q[fmt][NUM_OPERANDS-1].is_normal, operand_d_q[MAN_BITS-1:0]}
+                                         << (SUPER_DST_MAN_BITS - MAN_BITS);
+    end else begin : inactive_dst_format
+      assign info_q[fmt][NUM_OPERANDS-1] = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+      assign fmt_dst_sign[fmt]           = fpnew_pkg::DONT_CARE;             // format disabled
+      assign fmt_dst_exponent[fmt]       = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+      assign fmt_dst_mantissa[fmt]       = '{default: fpnew_pkg::DONT_CARE}; // format disabled
+    end
+  end
+
+  // -------------------------------------------
+  // Operation selection and operand adjustment
+  // -------------------------------------------
+
+  always_comb begin : op_select // Builds the A/B vector operands, the two scale operands (C) and operand D plus their class info
+    // Default assignments - packing-order-agnostic
+    if (src_is_int) begin : gen_int_default_assignments
+      // Integer operands: entries [0, VectorSize) form vector A, entries [VectorSize, 2*VectorSize) form vector B
+      for (int i = 0; i < VectorSize; i++) begin : gen_default_assignments_int
+        operands_a[i] = operands_post_inp_pipe[i];
+        operands_b[i] = operands_post_inp_pipe[i+VectorSize];
+        // integers carry no floating-point class: clear every info flag
+        info_a[i]     = fpnew_pkg::fp_info_t'(0);
+        info_b[i]     = fpnew_pkg::fp_info_t'(0);
+      end
+      for (int i = 0; i < FP6VectorSize; i++) begin : gen_default_assignments_fp6_int
+        // FP6 path: forwarded unmodified, same A/B split with FP6VectorSize as offset
+        fp6_operands_a[i] = fp6_operands_post_inp_pipe[i];
+        fp6_operands_b[i] = fp6_operands_post_inp_pipe[i+FP6VectorSize];
+        // clear every info flag
+        fp6_info_a[i]     = fpnew_pkg::fp_info_t'(0);
+        fp6_info_b[i]     = fpnew_pkg::fp_info_t'(0);
+      end
+      for (int i = 0; i < FP4VectorSize; i++) begin : gen_default_assignments_fp4_int
+        // FP4 path: forwarded unmodified, same A/B split with FP4VectorSize as offset
+        fp4_operands_a[i] = fp4_operands_post_inp_pipe[i];
+        fp4_operands_b[i] = fp4_operands_post_inp_pipe[i+FP4VectorSize];
+        // clear every info flag
+        fp4_info_a[i]     = fpnew_pkg::fp_info_t'(0);
+        fp4_info_b[i]     = fpnew_pkg::fp_info_t'(0);
+      end
+    end else begin : gen_fp_default_assignments
+      // Floating-point operands: re-pack {sign, exponent, mantissa} and the class info of the active source format
+      for (int i = 0; i < VectorSize; i++) begin : gen_default_assignments_fp
+        operands_a[i] = {fmt_sign[src_fmt_q][i], fmt_exponent[src_fmt_q][i], fmt_mantissa[src_fmt_q][i]};
+        operands_b[i] = {fmt_sign[src_fmt_q][i+VectorSize], fmt_exponent[src_fmt_q][i+VectorSize], fmt_mantissa[src_fmt_q][i+VectorSize]};
+        info_a[i]     = info_q[src_fmt_q][i];
+        info_b[i]     = info_q[src_fmt_q][i+VectorSize];
+      end
+      for (int i = 0; i < FP6VectorSize; i++) begin : gen_default_assignments_fp6
+        // FP6 path: same re-packing from the fp6_* per-format signals
+        fp6_operands_a[i] = {fp6_fmt_sign[src_fmt_q][i], fp6_fmt_exponent[src_fmt_q][i], fp6_fmt_mantissa[src_fmt_q][i]};
+        fp6_operands_b[i] = {fp6_fmt_sign[src_fmt_q][i+FP6VectorSize], fp6_fmt_exponent[src_fmt_q][i+FP6VectorSize], fp6_fmt_mantissa[src_fmt_q][i+FP6VectorSize]};
+        fp6_info_a[i]     = fp6_info_q[src_fmt_q][i];
+        fp6_info_b[i]     = fp6_info_q[src_fmt_q][i+FP6VectorSize];
+      end
+      for (int i = 0; i < FP4VectorSize; i++) begin : gen_default_assignments_fp4
+        // FP4 path: same re-packing from the fp4_* per-format signals
+        fp4_operands_a[i] = {fp4_fmt_sign[src_fmt_q][i], fp4_fmt_exponent[src_fmt_q][i], fp4_fmt_mantissa[src_fmt_q][i]};
+        fp4_operands_b[i] = {fp4_fmt_sign[src_fmt_q][i+FP4VectorSize], fp4_fmt_exponent[src_fmt_q][i+FP4VectorSize], fp4_fmt_mantissa[src_fmt_q][i+FP4VectorSize]};
+        fp4_info_a[i]     = fp4_info_q[src_fmt_q][i];
+        fp4_info_b[i]     = fp4_info_q[src_fmt_q][i+FP4VectorSize];
+      end
+    end
+    for (int i = 0; i < 2; i++) begin : gen_default_assignments_c
+      operands_c[i] = signed'(operands_c_q[i]) - 127; // signed scale, 127 = signed'(2**(SCALE_WIDTH-1)-1), the constant is hard-coded here
+      info_c[i] = '{is_normal: 1'b1, is_nan: operands_c_q[i] == 2**SCALE_WIDTH-1, is_boxed: 1'b1, default: 1'b0}; // normal, boxed value, scale can be NaN (all-ones encoding)
+    end
+    operand_d = {fmt_dst_sign[dst_fmt_q], fmt_dst_exponent[dst_fmt_q], fmt_dst_mantissa[dst_fmt_q]}; // operand D follows the destination format
+    info_d    = info_q[dst_fmt_q][NUM_OPERANDS-1]; // class info of operand D is stored in the last operand slot
+  end
+endmodule
+
+// Detects special cases (NaN, infinity, invalid operations like 0×inf) and generates canonical results.
+// Only FP8 sources can have inf/nan; FP6 and FP4 have limited exponent ranges. Result, status and bypass flag are built per format and selected by dst_fmt_q.
+module fpnew_mxdotp_special_cases
+  import fpnew_mxdotp_multi_pkg::*;
+#(
+  parameter fpnew_pkg::fmt_logic_t FpDstFmtConfig = MxdotpDstFpFmtConfig // formats for which special-case logic is generated
+) (
+  // Input signals: source vectors (a, b), scales (c) and operand d together with their classification info
+  input  fp_src_t [VectorSize-1:0]             operands_a,
+  input  fp_src_t [VectorSize-1:0]             operands_b,
+  input  logic signed [1:0][SCALE_WIDTH-1:0]   operands_c, // not referenced in the module body (only info_c is used)
+  input  fp_dst_t                              operand_d,
+  input  fpnew_pkg::fp_info_t [VectorSize-1:0] info_a,
+  input  fpnew_pkg::fp_info_t [VectorSize-1:0] info_b,
+  input  fpnew_pkg::fp_info_t [1:0]            info_c,
+  input  fpnew_pkg::fp_info_t                  info_d,
+  input fpnew_pkg::fp_format_e                 dst_fmt_q, // selects which per-format result is driven out
+  // Output signals: special_result, special_status, result_is_special
+  output logic [DST_WIDTH-1:0]                 special_result,
+  output fpnew_pkg::status_t                   special_status,
+  output logic                                 result_is_special
+);
+
+  // ---------------------
+  // Input classification
+  // ---------------------
+  logic any_operand_inf;  // an element of a or b, or operand d, is infinite
+  logic any_operand_nan;  // an element of a or b, a scale, or operand d is NaN
+  logic signalling_nan;   // at least one of those inputs is flagged as signalling
+  logic any_produced_nan; // some lane multiplies an infinity by a zero
+  logic any_pos_inf;      // a lane with an infinite factor and equal signs, or d is +inf
+  logic any_neg_inf;      // a lane with an infinite factor and opposite signs, or d is -inf
+
+  // Per-lane versions of the conditions above (bit i belongs to the pair a[i], b[i])
+  logic [VectorSize-1:0] operand_inf_conditions;
+  logic [VectorSize-1:0] operand_nan_conditions;
+  logic [VectorSize-1:0] signalling_nan_conditions;
+  logic [VectorSize-1:0] nan_conditions;
+  logic [VectorSize-1:0] pos_inf_conditions;
+  logic [VectorSize-1:0] neg_inf_conditions;
+
+  // Single generate block evaluating all conditions lane by lane
+  generate
+    for (genvar i = 0; i < VectorSize; i = i + 1) begin : gen_conditions
+      // Check if any operand is infinite
+      assign operand_inf_conditions[i] = info_a[i].is_inf || info_b[i].is_inf;
+
+      // Check if any operand is NaN
+      assign operand_nan_conditions[i] = info_a[i].is_nan || info_b[i].is_nan;
+
+      // Check for signalling NaN
+      assign signalling_nan_conditions[i] = info_a[i].is_signalling || info_b[i].is_signalling;
+
+      // Check for produced NaN (0 * inf or inf * 0)
+      assign nan_conditions[i] = (info_a[i].is_inf && info_b[i].is_zero) ||
+                                  (info_b[i].is_inf && info_a[i].is_zero);
+
+      // Check for a positive infinite product (a factor is inf and both factors have the same sign)
+      assign pos_inf_conditions[i] = (info_a[i].is_inf && ~(operands_a[i].sign ^ operands_b[i].sign)) ||
+                                      (info_b[i].is_inf && ~(operands_a[i].sign ^ operands_b[i].sign));
+
+      // Check for a negative infinite product (a factor is inf and the factors have opposite signs)
+      assign neg_inf_conditions[i] = (info_a[i].is_inf && (operands_a[i].sign ^ operands_b[i].sign)) ||
+                                      (info_b[i].is_inf && (operands_a[i].sign ^ operands_b[i].sign));
+    end
+  endgenerate
+
+  // OR-reduce the lane conditions and merge in the scale (c) and operand d contributions
+  assign any_operand_inf = |operand_inf_conditions || info_d.is_inf;
+  assign any_operand_nan = |operand_nan_conditions || info_c[0].is_nan || info_c[1].is_nan || info_d.is_nan;
+  assign signalling_nan  = |signalling_nan_conditions || info_c[0].is_signalling || info_c[1].is_signalling || info_d.is_signalling;
+  assign any_produced_nan = |nan_conditions;
+  assign any_pos_inf = |pos_inf_conditions || (info_d.is_inf && ~operand_d.sign);
+  assign any_neg_inf = |neg_inf_conditions || (info_d.is_inf && operand_d.sign);
+
+  // ----------------------
+  // Special case handling
+  // ----------------------
+  logic               [NUM_FORMATS-1:0][DST_WIDTH-1:0] fmt_special_result;
+  fpnew_pkg::status_t [NUM_FORMATS-1:0]                fmt_special_status;
+  logic               [NUM_FORMATS-1:0]                fmt_result_is_special;
+
+  for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : gen_special_results
+    // Set up some constants
+    localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
+
+    localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = '1;              // all-ones exponent, also used for the infinity results below
+    localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1); // only the mantissa MSB is set
+    localparam logic [MAN_BITS-1:0] ZERO_MANTISSA = '0;              // mantissa of an infinity
+
+    if (FpDstFmtConfig[fmt]) begin : active_format
+      always_comb begin : special_cases
+        logic [FP_WIDTH-1:0] special_res;
+
+        // Default assignment
+        special_res                = {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
+        fmt_special_status[fmt]    = '0;
+        fmt_result_is_special[fmt] = 1'b0;
+
+        // Handle potentially mixed nan & infinity input => important for the case where infinity and
+        // zero are multiplied and added to a qNaN.
+        // RISC-V mandates raising the NV exception in these cases:
+        // (inf * 0) + c or (0 * inf) + c INVALID, no matter c (even quiet NaNs)
+        if (any_produced_nan) begin
+          fmt_result_is_special[fmt] = 1'b1; // bypass OP, output is the canonical qNaN
+          fmt_special_status[fmt].NV = 1'b1; // invalid operation
+        // NaN Inputs cause canonical quiet NaN at the output and maybe invalid OP
+        end else if (any_operand_nan) begin
+          fmt_result_is_special[fmt] = 1'b1;           // bypass OP, output is the canonical qNaN
+          fmt_special_status[fmt].NV = signalling_nan; // raise the invalid operation flag if signalling
+        // Special cases involving infinity
+        end else if (any_operand_inf) begin
+          fmt_result_is_special[fmt] = 1'b1; // bypass OP
+          // Effective addition of opposite infinities (±inf - ±inf) is invalid! The default qNaN is kept as result.
+          if (any_pos_inf && any_neg_inf) begin
+            fmt_special_status[fmt].NV = 1'b1; // invalid operation
+          // Only positive infinities are involved (from a product or from operand d)
+          end else if (any_pos_inf) begin
+            // Result is infinity with the positive sign
+            special_res = {1'b0, QNAN_EXPONENT, ZERO_MANTISSA};
+          // Only negative infinities are involved (from a product or from operand d)
+          end else if (any_neg_inf) begin
+            // Result is infinity with the negative sign
+            special_res = {1'b1, QNAN_EXPONENT, ZERO_MANTISSA};
+          end
+        end
+        // Initialize special result with ones (NaN-box), then overwrite the low FP_WIDTH bits with the value
+        fmt_special_result[fmt]               = '1;
+        fmt_special_result[fmt][FP_WIDTH-1:0] = special_res;
+      end
+    end else begin : inactive_format
+      assign fmt_special_result[fmt] = '{default: fpnew_pkg::DONT_CARE};
+      assign fmt_special_status[fmt] = '0;
+      assign fmt_result_is_special[fmt] = 1'b0;
+    end
+  end
+
+  // Select the special-case indicator of the active destination format
+  assign result_is_special = fmt_result_is_special[dst_fmt_q];
+  // Select the status flags of the active destination format (NV is the only flag ever set above)
+  assign special_status = fmt_special_status[dst_fmt_q];
+  // Assemble result according to destination format
+  assign special_result = fmt_special_result[dst_fmt_q];
+endmodule
+
+// Adds two signed scale values (SCALE_WIDTH bits each, 8 in the default package) into a combined scale that is one bit wider (9 bits).
+module fpnew_mxdotp_scale_adder
+  import fpnew_mxdotp_multi_pkg::*;
+(
+  // Input signals
+  input  logic signed [1:0][SCALE_WIDTH-1:0] operands_c, // the two scales, one per packed element
+  output logic signed [SCALE_WIDTH:0] scale // Output: sum of both scales, +1 bit for addition
+);
+  // ------------------
+  // Scale data path
+  // ------------------
+  assign scale = signed'(operands_c[0]) + signed'(operands_c[1]); // element selects of a packed array are unsigned, hence the signed casts
+endmodule
+
+// Multiplies two vectors of mantissas (with implicit bit prepended) element-wise, applying sign logic.
+// Produces signed products (2p+1 bits) based on XOR of input signs, where p = PrecisionBits.
+module fpnew_mxdotp_vector_multiplier
+  import fpnew_mxdotp_multi_pkg::*;
+#(
+  parameter type         SrcType         = logic, // element type, must provide the fields .sign and .mantissa
+  parameter int unsigned LocalVectorSize = 8,     // number of element pairs that are multiplied
+  parameter int unsigned PrecisionBits   = 4      // p: mantissa width including the implicit bit
+) (
+  // Input signals
+  input  SrcType [LocalVectorSize-1:0] operands_a,
+  input  SrcType [LocalVectorSize-1:0] operands_b,
+  input  fpnew_pkg::fp_info_t [LocalVectorSize-1:0] info_a, // only is_normal is read
+  input  fpnew_pkg::fp_info_t [LocalVectorSize-1:0] info_b, // only is_normal is read
+  output logic signed [LocalVectorSize-1:0][2*PrecisionBits :0] product_signed // Output: one signed product per element pair
+);
+  // ------------------
+  // Product data path
+  // ------------------
+  logic [LocalVectorSize-1:0][  PrecisionBits-1:0] mantissa_a, mantissa_b;
+  logic [LocalVectorSize-1:0][2*PrecisionBits-1:0] product;  // the p*p product is 2p-bit wide
+
+  // Add implicit bits to mantissae (1 for normal inputs, 0 otherwise), multiply, then apply the sign
+  for (genvar i = 0; i < LocalVectorSize; i++) begin : gen_mantissa
+    assign mantissa_a[i] = {info_a[i].is_normal, operands_a[i].mantissa};
+    assign mantissa_b[i] = {info_b[i].is_normal, operands_b[i].mantissa};
+    assign product[i]    = mantissa_a[i] * mantissa_b[i]; // unsigned magnitude of the product
+    assign product_signed[i] = (operands_a[i].sign ^ operands_b[i].sign) ? -product[i] : product[i]; // negate when the signs differ
+  end
+endmodule
+
+// Multiplies vectors of signed integers (INT8) or floating-point mantissas (FP8) with sign handling.
+// For FP8: adds implicit bit and applies sign via negation. For INT8: uses full 8-bit signed values.
+module fpnew_mxdotp_signed_vector_multiplier
+  import fpnew_mxdotp_multi_pkg::*;
+#(
+  parameter type         SrcType         = logic, // element type, must provide .sign and .mantissa and allow a [7:0] bit select
+  parameter int unsigned LocalVectorSize = 8,     // number of element pairs that are multiplied
+  parameter int unsigned PrecisionBits   = 8      // width of each signed factor
+) (
+  // Input signals
+  input  SrcType [LocalVectorSize-1:0] operands_a,
+  input  SrcType [LocalVectorSize-1:0] operands_b,
+  input  fpnew_pkg::fp_format_e  src_fmt_q, // NOTE(review): not referenced in this module body
+  input  fpnew_pkg::int_format_e int_fmt_q,
+  input  logic src_is_int,
+  input  fpnew_pkg::fp_info_t [LocalVectorSize-1:0] info_a,
+  input  fpnew_pkg::fp_info_t [LocalVectorSize-1:0] info_b,
+  output logic signed [LocalVectorSize-1:0][2*PrecisionBits-1:0] product_signed // Output: one signed product per element pair
+);
+  // ------------------
+  // Product data path
+  // ------------------
+  logic signed [LocalVectorSize-1:0][  PrecisionBits-1:0] mantissa_a, mantissa_b;
+
+  for (genvar i = 0; i < LocalVectorSize; i++) begin : gen_mantissa_fp8
+    always_comb begin
+      if (src_is_int && int_fmt_q == fpnew_pkg::INT8) begin : int8
+        // For INT8, the low 8 bits of each operand are used directly as signed factor
+        mantissa_a[i] = operands_a[i][7:0];
+        mantissa_b[i] = operands_b[i][7:0];
+      end else begin : fp8
+        // Add implicit bits to mantissae and pad with four zero bits (the padding width is hard-coded)
+        mantissa_a[i] = {4'b0, info_a[i].is_normal, operands_a[i].mantissa};
+        mantissa_b[i] = {4'b0, info_b[i].is_normal, operands_b[i].mantissa};
+        if (operands_a[i].sign ^ operands_b[i].sign) begin
+          // If the signs are different, negate one factor (a) so that the signed product becomes negative
+          mantissa_a[i] = -signed'(mantissa_a[i]);
+        end
+      end
+    end
+  end
+
+  for (genvar i = 0; i < LocalVectorSize; i++) begin : gen_mantissa
+    assign product_signed[i] = signed'(mantissa_a[i]) * signed'(mantissa_b[i]); // element selects are unsigned, hence the signed casts
+  end
+endmodule
+
+// Shifts products left by (exp_a + exp_b - 2×bias + SOP_SHIFT) to align to fixed-point anchor.
+// Handles FP8/FP6/FP4 with format-specific offsets; INT8 shifts directly to anchor position.
+module fpnew_mxdotp_product_shifter
+  import fpnew_mxdotp_multi_pkg::*;
+#(
+  parameter type         SrcType          = logic,          // element type, must provide the field .exponent
+  parameter int unsigned LocalVectorSize  = 8,              // number of products
+  parameter fpnew_pkg::fp_format_e SrcFmt = fpnew_pkg::FP8, // selects which shifter variant is generated
+  parameter int unsigned ProductBits      = 4,              // width of each incoming product
+  parameter int unsigned ExpWidth         = 8,              // width of the product exponent
+  parameter int unsigned OutputWidth      = 70              // width of each shifted product
+) (
+  // Input signals
+  input  SrcType [LocalVectorSize-1:0] operands_a,
+  input  SrcType [LocalVectorSize-1:0] operands_b,
+  input  logic [LocalVectorSize-1:0][ProductBits-1:0] product_signed, // reinterpreted as signed before shifting
+  input  fpnew_pkg::fp_info_t [LocalVectorSize-1:0] info_a,
+  input  fpnew_pkg::fp_info_t [LocalVectorSize-1:0] info_b,
+  input  fpnew_pkg::fp_format_e src_fmt_q,
+  input  fpnew_pkg::int_format_e int_fmt_q,
+  input  logic src_is_int,
+  output logic signed [LocalVectorSize-1:0][OutputWidth-1:0] shifted_product // Output: aligned products
+);
+  // ------------------
+  // Shift data path
+  // ------------------
+  logic signed [LocalVectorSize-1:0][ExpWidth-1:0] exponent_product;
+
+  // Calculate the non-biased exponent of the product (subnormal inputs get +1 on their exponent field)
+  for (genvar i = 0; i < LocalVectorSize; i++) begin : gen_exponent_adjustment
+    assign exponent_product[i] = operands_a[i].exponent + info_a[i].is_subnormal
+                                + operands_b[i].exponent + info_b[i].is_subnormal
+                                - 2*signed'(bias_constant(src_fmt_q));
+    if (SrcFmt == fpnew_pkg::FP8) begin
+      always_comb begin // TODO: Generate only for INT8 vs FP8
+        if (src_is_int && int_fmt_q == fpnew_pkg::INT8) begin
+          // INT8: shift to integer position
+          shifted_product[i] = signed'(product_signed[i]) << ANCHOR;
+        end else begin
+          // Left shift the product by SOP_SHIFT + exponent_product
+          // sum of four 9-bit numbers can be at most 11 bits, for 69 bits output we need to shift by 69 - 11 = 58
+          // 58-30=28 plus inherit 6 fractional bits from the multiplication -> point moves to 28+6=34
+          // max shift can be 58 (28 + exp-max(30)), min shift is 0 (28 + exp-min(-28))
+          shifted_product[i] = signed'(product_signed[i]) << (signed'(SOP_SHIFT) + signed'(exponent_product[i]));
+        end
+      end
+    end else if (SrcFmt == fpnew_pkg::FP6) begin
+      // E3 exponent_product is in range [-4, 8], requires 5b for signed representation
+      // To make the shift amount non-negative, a constant offset of 4 is added
+      assign shifted_product[i] = signed'(product_signed[i]) << (signed'(4) + signed'(exponent_product[i]));
+    end else begin
+      // exponent_product is negative only for zero inputs for FP4
+      assign shifted_product[i] = signed'(product_signed[i]) << exponent_product[i];
+    end
+  end
+endmodule
+
+// Sums all shifted products in the vector into a single signed value of OutputWidth bits.
+module fpnew_mxdotp_adder_tree
+  import fpnew_mxdotp_multi_pkg::*;
+#(
+  parameter int unsigned LocalVectorSize = 8, // number of products to add
+  parameter int unsigned InputWidth      = 4, // width of each product
+  parameter int unsigned OutputWidth     = 70 // width of the sum
+) (
+  // Input signals
+  input  logic signed [LocalVectorSize-1:0][InputWidth-1:0] shifted_product,
+  output logic signed [OutputWidth-1:0] sum_product // Output: sum over all elements
+);
+  // ------------------
+  // Adder data path
+  // ------------------
+  // Sum the products: start from zero and accumulate every element
+  always_comb begin : sum_products
+    sum_product = '0;
+    for (int i = 0; i < LocalVectorSize; i++) begin : gen_sum_products
+      sum_product += signed'(shifted_product[i]); // element selects are unsigned, the cast restores the signed interpretation
+    end
+  end
+endmodule
+
+// Adds FP8, FP6, and FP4 sum-of-products; shifts FP6 and FP4 sums to align before adding.
+// When FP6 is disabled, sum_product_fp6 is zero and optimized away by synthesis.
+module fpnew_mxdotp_format_adder
+  import fpnew_mxdotp_multi_pkg::*;
+#(
+  parameter int unsigned Fp6SumWidth = FP6_PROD_SHIFT_WIDTH, // width of the FP6 partial sum
+  parameter int unsigned Fp4SumWidth = FP4_PROD_SHIFT_WIDTH  // width of the FP4 partial sum
+) (
+  input  logic signed [SOP_FIXED_WIDTH-1:0] sum_product_fp8, // added without any further shift
+  input  logic signed [Fp6SumWidth-1:0]     sum_product_fp6,
+  input  logic signed [Fp4SumWidth-1:0]     sum_product_fp4,
+  output logic signed [FIXED_SUM_WIDTH-1:0] sum_product      // Output: total of the three partial sums
+);
+  // ------------------
+  // Adder data path
+  // ------------------
+  logic signed [FIXED_SUM_WIDTH-1:0] sum_product_fp4_shifted;
+  logic signed [FIXED_SUM_WIDTH-1:0] sum_product_fp6_shifted;
+
+  assign sum_product_fp4_shifted = signed'(sum_product_fp4) << (SOP_SHIFT+2*(SUPER_MAN_BITS-FP4_MAN_BITS)); // SOP_SHIFT plus twice the mantissa width difference to the super format
+  assign sum_product_fp6_shifted = signed'(sum_product_fp6) << (SOP_SHIFT-4+2*(SUPER_MAN_BITS-FP6_MAN_BITS)); // 4 is subtracted to account for the 4-bit shift in the product shifter
+  assign sum_product = sum_product_fp8 + sum_product_fp4_shifted + sum_product_fp6_shifted;
+endmodule
+
+// Shifts the accumulator (left or right) to align with sum-of-products based on scale and accumulator exponent.
+// Computes shift amount, handles sticky bits, and detects if accumulator dominates the result.
+module fpnew_mxdotp_accumulator_shift
+  import fpnew_mxdotp_multi_pkg::*;
+(
+  // Input signals
+  input  logic signed [FIXED_SUM_WIDTH-1:0] sum_product_q, // only compared against zero in this module
+  input  logic [SCALE_WIDTH:0] scale_q2,                   // combined scale, reinterpreted as signed below
+  input  fp_dst_t operand_d_q2,                            // accumulator operand
+  input  fpnew_pkg::fp_info_t info_d_q,                    // class info of the accumulator
+  input  fpnew_pkg::fp_format_e dst_fmt_q2,                // selects the bias constant
+  output logic result_is_accumulator,                      // set when the result equals the accumulator
+  output logic accumulator_is_right_shifted,               // set when the shift amount is negative
+  output logic signed [9:0] accumulator_right_shift_amount, // magnitude of the right shift, zero otherwise
+  output logic signed [DST_PRECISION_BITS-1:0] accumulator_remaining, // accumulator bits below accumulator_shifted
+  output logic accumulator_sticky,                         // OR of accumulator bits below accumulator_remaining
+  output logic signed [DST_PRECISION_BITS :0] signed_mantissa_d, // mantissa with implicit bit and sign applied
+  output logic signed [FIXED_SUM_WIDTH-1:0] accumulator_shifted  // aligned accumulator
+);
+
+  // -----------------------------
+  // Accumulator shift data path
+  // -----------------------------
+  logic signed [9:0] accumulator_shift_amount;
+  logic signed [DST_EXP_WIDTH-1:0] exponent_d;
+  logic [DST_PRECISION_BITS-1:0] mantissa_d;
+
+  // Prepend a zero to the exponent field for the signed container - any further widening is implicit
+  assign exponent_d = {1'b0, operand_d_q2.exponent};
+  assign mantissa_d = {info_d_q.is_normal, operand_d_q2.mantissa}; // implicit bit is 1 for normal values, 0 otherwise
+  assign signed_mantissa_d = operand_d_q2.sign ? -mantissa_d : mantissa_d; // negated when the sign bit is set
+
+  // Calculate the shift amount for the accumulator; original note gives the range [-370, 394], hence a signed 10b container
+  assign accumulator_shift_amount = signed'(ANCHOR - SUPER_DST_MAN_BITS) - signed'(scale_q2)
+                                     + signed'(exponent_d + info_d_q.is_subnormal)
+                                     - signed'(bias_constant(dst_fmt_q2));
+
+  always_comb begin : accumulator_shift
+    result_is_accumulator = 1'b0;
+    accumulator_is_right_shifted = 1'b0;
+    accumulator_right_shift_amount = '0;
+    accumulator_remaining = '0;
+    accumulator_sticky = 1'b0;
+    if (accumulator_shift_amount > MAX_ACC_SHIFT_AMOUNT) begin
+      // SoP is too small to change the accumulator, result is the accumulator
+      accumulator_shifted = '0;
+      result_is_accumulator = 1'b1;
+    end else if (accumulator_shift_amount >= 0) begin // non-negative amount: arithmetic left shift
+      accumulator_shifted = signed'(signed_mantissa_d) <<< accumulator_shift_amount;
+    end else begin // negative amount: arithmetic right shift by its magnitude
+      accumulator_is_right_shifted = 1'b1;
+      accumulator_right_shift_amount = -accumulator_shift_amount;
+      accumulator_shifted = signed'(signed_mantissa_d) >>> accumulator_right_shift_amount;
+      if (accumulator_right_shift_amount > DST_PRECISION_BITS) begin // shifted by more than the remainder can hold
+        result_is_accumulator = (sum_product_q == '0) ? 1'b1 : 1'b0; // with a zero SoP the accumulator alone is the result
+        accumulator_remaining = signed'(signed_mantissa_d) >>> (accumulator_right_shift_amount - DST_PRECISION_BITS);
+        accumulator_sticky = |(signed'(signed_mantissa_d) & ((1 << (accumulator_right_shift_amount - DST_PRECISION_BITS)) - 1)); // NOTE(review): mask is built from an unsized literal 1, confirm for large shift amounts
+      end else begin
+        accumulator_remaining = signed'(signed_mantissa_d) << (DST_PRECISION_BITS - accumulator_right_shift_amount); // all shifted-out bits fit into the remainder
+        accumulator_sticky = 1'b0;
+      end
+    end
+  end
+endmodule
+
+// Adds aligned accumulator to sum-of-products, extending with accumulator remainder bits.
+module fpnew_mxdotp_add_accumulator_sop
+  import fpnew_mxdotp_multi_pkg::*;
+(
+  // Input signals
+  input  logic signed [FIXED_SUM_WIDTH-1:0] sum_product_q,
+  input  logic signed [FIXED_SUM_WIDTH-1:0] accumulator_shifted,
+  input  logic signed [DST_PRECISION_BITS-1:0] accumulator_remaining, // appended below the LSB of the sum
+  output logic signed [LZC_SUM_WIDTH-1:0] sum_product_accumulator_extended // Output: {sum, remainder}
+);
+
+  logic signed [FIXED_SUM_WIDTH-1:0] sum_product_accumulator; // same width as the addends, no extra carry bit
+
+  assign sum_product_accumulator = sum_product_q + accumulator_shifted;
+  assign sum_product_accumulator_extended = {sum_product_accumulator, accumulator_remaining}; // remainder forms the low bits
+endmodule
+
+// Converts results to sign-magnitude format using two's complement.
+module fpnew_mxdotp_twos_compl
+  import fpnew_mxdotp_multi_pkg::*;
+(
+  // Input signals
+  input  logic [LZC_SUM_WIDTH-1:0] sum_product_accumulator_extended, // signed sum to convert
+  input  logic signed [DST_PRECISION_BITS :0] signed_mantissa_d,     // only compared against zero
+  input  logic accumulator_is_right_shifted,
+  input  logic signed [9:0] accumulator_right_shift_amount,
+  input  logic final_sign,                                           // 1: the sum is negative and must be complemented
+  // Output signals
+  output logic [LZC_SUM_WIDTH-1:0] sum_magnitude
+);
+  // ------------------
+  // Two's complement
+  // ------------------
+
+  always_comb begin : get_twos_complement
+    if (final_sign) begin
+      sum_magnitude = ~sum_product_accumulator_extended + 1; // default for negative sums: invert and add one
+      if (accumulator_is_right_shifted && accumulator_right_shift_amount > DST_PRECISION_BITS && signed_mantissa_d != 0) begin // NOTE(review): presumably accumulator bits lost below the sum absorb the +1 - confirm
+        sum_magnitude = ~sum_product_accumulator_extended; // invert only, the increment is skipped
+      end
+    end else begin
+      sum_magnitude = sum_product_accumulator_extended; // non-negative sums pass through unchanged
+    end
+  end
+endmodule
+
+// Shifts magnitude left by normalization amount to align leading 1 to implicit bit position.
+module fpnew_mxdotp_norm_shift
+  import fpnew_mxdotp_multi_pkg::*;
+(
+  // Input signals
+  input  logic [LZC_SUM_WIDTH-1:0] sum_magnitude,   // unsigned magnitude to normalize
+  input  logic [SHIFT_AMOUNT_WIDTH-1:0] norm_shamt, // unsigned left-shift amount
+  // Output signals
+  output logic [LZC_SUM_WIDTH-1:0] sum_shifted      // same width as the input, bits shifted past the MSB are dropped
+);
+  // ------------------
+  // Normalization shift
+  // ------------------
+
+  // Shift the sum to normalize it (logical left shift, zeros enter at the LSB)
+  assign sum_shifted = sum_magnitude << norm_shamt;
+endmodule
+
+// Normalizes fixed-point sum to floating-point: computes LZC, determines shift amount, calculates
+// biased exponent (127 - anchor + scale + shift), and extracts mantissa. Handles subnormals.
+module fpnew_mxdotp_normalizer
+  import fpnew_mxdotp_multi_pkg::*;
+(
+  // Input signals
+  input  logic signed [LZC_SUM_WIDTH-1:0] sum_product_accumulator_extended,
+  input  logic accumulator_is_right_shifted,                  // forwarded to the complement stage
+  input  logic signed [9:0] accumulator_right_shift_amount,   // forwarded to the complement stage
+  input  logic signed [DST_PRECISION_BITS :0] signed_mantissa_d, // forwarded to the complement stage
+  input  logic accumulator_sticky,                            // merged into sticky_after_norm
+  input  logic [SCALE_WIDTH:0] scale_q2,
+  input  fpnew_pkg::fp_format_e dst_fmt_q2,                   // NOTE(review): not referenced in this module body
+  // Output signals
+  output logic final_sign,
+  output logic signed [DST_EXP_WIDTH-1:0] final_exponent,
+  output logic [DST_PRECISION_BITS-1:0] final_mantissa,
+  output logic sticky_after_norm,
+  output logic        [LZC_SUM_WIDTH-1:0]  sum_magnitude
+);
+
+  // --------------
+  // Normalization
+  // --------------
+  logic        [LZC_SUM_WIDTH-1:0]  sum_shifted;
+  logic        [LZC_RESULT_WIDTH-1:0] leading_zero_count;     // the number of leading zeroes
+  logic signed [LZC_RESULT_WIDTH:0]   leading_zero_count_sgn; // signed leading-zero count
+  logic                               lzc_zeroes;             // in case only zeroes found
+
+  logic signed [DST_EXP_WIDTH-1:0]      final_tentative_exponent;
+
+  logic        [SHIFT_AMOUNT_WIDTH-1:0] norm_shamt; // Normalization shift amount
+  logic signed [DST_EXP_WIDTH-1:0]      normalized_exponent;
+
+  logic        [LZC_SUM_WIDTH-DST_PRECISION_BITS-1:0] sum_sticky_bits;
+
+  // Result sign is the MSB of the signed sum
+  // If sum is negative, it is complemented below before it feeds the leading zero counter
+  assign final_sign    = sum_product_accumulator_extended[LZC_SUM_WIDTH-1];
+
+  fpnew_mxdotp_twos_compl #(
+  ) i_twos_compl (
+    .sum_product_accumulator_extended ( sum_product_accumulator_extended ),
+    .final_sign                      ( final_sign                      ),
+    .signed_mantissa_d               ( signed_mantissa_d               ),
+    .accumulator_is_right_shifted    ( accumulator_is_right_shifted    ),
+    .accumulator_right_shift_amount  ( accumulator_right_shift_amount  ),
+    .sum_magnitude( sum_magnitude                  )
+  );
+
+  // Leading zero counter on the magnitude
+  lzc #(
+    .WIDTH ( LZC_SUM_WIDTH ),
+    .MODE  ( 1               ) // MODE = 1 counts leading zeroes
+  ) i_lzc (
+    .in_i    ( sum_magnitude      ),
+    .cnt_o   ( leading_zero_count ),
+    .empty_o ( lzc_zeroes         )
+  );
+
+  assign leading_zero_count_sgn = signed'({1'b0, leading_zero_count});
+
+  // Calculate the biased exponent (excess-127 form)
+  // The exponent-major is -scaled_anchor
+  // exponent = 127 - scaled_anchor + (94-count-1) + increment_exponent [-195, 315 9b -> 10b signed]
+  assign final_tentative_exponent = 127 - (signed'(ANCHOR)-signed'(scale_q2)) + (signed'(FIXED_SUM_WIDTH) - leading_zero_count_sgn - 1); // 127 = signed'(fpnew_pkg::bias(dst_fmt_q2))
+
+  // Normalization shift amount based on exponents and LZC (unsigned as only left shifts)
+  always_comb begin : norm_shift_amount
+    // Normal results: shift by LZC + 1 so the leading one leaves the vector and becomes the implicit bit
+    if (final_tentative_exponent > 0 && !lzc_zeroes) begin
+      norm_shamt          = leading_zero_count_sgn + 1;
+      normalized_exponent = final_tentative_exponent;
+    end else begin // Subnormals and zero
+      norm_shamt          = leading_zero_count_sgn + final_tentative_exponent;
+      normalized_exponent = '0; // subnormals encoded as 0
+    end
+  end
+
+  fpnew_mxdotp_norm_shift #(
+  ) i_norm_shift (
+    .sum_shifted          ( sum_shifted          ),
+    .sum_magnitude        ( sum_magnitude        ),
+    .norm_shamt           ( norm_shamt           )
+  );
+
+  // Top bits of the shifted sum form the mantissa, the rest is sticky; LSB of final mantissa is the rounding bit
+  assign {final_mantissa, sum_sticky_bits} = sum_shifted;
+  assign final_exponent                    = normalized_exponent;
+  assign sticky_after_norm                 = (|sum_sticky_bits) | accumulator_sticky;
+endmodule
+
+// Rounds normalized result to destination format with IEEE rounding modes (RNE/RTZ/RDN/RUP/RMM).
+// Detects overflow/underflow before and after rounding, generates round/sticky bits.
+module fpnew_mxdotp_rounder
+  import fpnew_mxdotp_multi_pkg::*;
+#(
+  parameter fpnew_pkg::fmt_logic_t FpDstFmtConfig = MxdotpDstFpFmtConfig // formats for which result logic is generated
+) (
+  // Input signals
+  input  logic clk_i,  // only forwarded to the rounding instance
+  input  logic rst_ni, // only forwarded to the rounding instance
+  input  logic final_sign,
+  input  logic [DST_EXP_WIDTH-1:0] final_exponent,
+  input  logic [DST_PRECISION_BITS-1:0] final_mantissa,
+  input  logic [LZC_SUM_WIDTH-1:0] sum_magnitude, // NOTE(review): not referenced in this module body
+  input  logic sticky_after_norm,
+  input fpnew_pkg::fp_format_e dst_fmt_q2,
+  input fpnew_pkg::roundmode_e rnd_mode_q,
+  // Output signals
+  output logic [NUM_FORMATS-1:0][DST_WIDTH-1:0] fmt_result, // one rounded result per format, NaN-boxed
+  output logic [1:0] round_sticky_bits,                     // {round, sticky} of the active destination format
+  output logic of_before_round,
+  output logic of_after_round,
+  output logic uf_after_round
+);
+
+  // ----------------------------
+  // Rounding and classification
+  // ----------------------------
+  logic                                             pre_round_sign;
+  logic [SUPER_DST_EXP_BITS+SUPER_DST_MAN_BITS-1:0] pre_round_abs; // absolute value of result before rounding
+
+  logic [NUM_FORMATS-1:0][SUPER_DST_EXP_BITS+SUPER_DST_MAN_BITS-1:0] fmt_pre_round_abs; // per format
+  logic [NUM_FORMATS-1:0][1:0]                                       fmt_round_sticky_bits;
+
+  logic [NUM_FORMATS-1:0]                           fmt_of_after_round;
+  logic [NUM_FORMATS-1:0]                           fmt_uf_after_round;
+
+  logic                                             rounded_sign;
+  logic [SUPER_DST_EXP_BITS+SUPER_DST_MAN_BITS-1:0] rounded_abs; // absolute value of result after rounding
+  logic                                             result_zero; // driven by the rounding instance, not used further here
+
+  // Classification before round. RISC-V mandates checking underflow AFTER rounding
+  assign of_before_round = final_exponent >= 2**(fpnew_pkg::exp_bits(dst_fmt_q2))-1; // infinity exponent is all ones
+
+  // Pack exponent and mantissa into proper rounding form
+  for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : gen_res_assemble
+    // Set up some constants
+    localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned ALL_EXTRA_BITS = fpnew_pkg::maximum(SUPER_DST_MAN_BITS-MAN_BITS+1+DST_PRECISION_BITS+PRECISION_BITS+2+1, 1); // NOTE(review): not referenced anywhere in this module
+
+    logic [EXP_BITS-1:0] pre_round_exponent;
+    logic [MAN_BITS-1:0] pre_round_mantissa;
+
+    if (FpDstFmtConfig[fmt]) begin : active_dst_format
+
+      assign pre_round_exponent = (of_before_round) ? 2**EXP_BITS-2 : final_exponent[EXP_BITS-1:0]; // overflow: largest exponent below all ones
+      assign pre_round_mantissa = (of_before_round) ? '1 : final_mantissa[SUPER_DST_MAN_BITS-:MAN_BITS]; // overflow: all-ones mantissa
+      // Assemble result before rounding. In case of overflow, the largest normal value is set.
+      assign fmt_pre_round_abs[fmt] = {pre_round_exponent, pre_round_mantissa}; // zero-extended to the super-format width
+
+      // Round bit is after mantissa (1 in case of overflow for rounding)
+      assign fmt_round_sticky_bits[fmt][1] = final_mantissa[SUPER_DST_MAN_BITS-MAN_BITS] |
+                                             of_before_round;
+
+      // remaining bits in mantissa to sticky (1 in case of overflow for rounding)
+      if (MAN_BITS < SUPER_DST_MAN_BITS) begin : narrow_sticky
+        assign fmt_round_sticky_bits[fmt][0] = (| final_mantissa[SUPER_DST_MAN_BITS-MAN_BITS-1:0]) |
+                                               sticky_after_norm | of_before_round;
+      end else begin : normal_sticky
+        assign fmt_round_sticky_bits[fmt][0] = sticky_after_norm | of_before_round;
+      end
+    end else begin : inactive_format
+      assign fmt_pre_round_abs[fmt] = '{default: fpnew_pkg::DONT_CARE};
+      assign fmt_round_sticky_bits[fmt] = '{default: fpnew_pkg::DONT_CARE};
+    end
+  end
+
+  // Select the pre-round value of the active destination format. In case of overflow, the largest normal value is set.
+  assign pre_round_abs      = fmt_pre_round_abs[dst_fmt_q2];
+
+  // In case of overflow, the round and sticky bits are set for proper rounding
+  assign round_sticky_bits  = fmt_round_sticky_bits[dst_fmt_q2];
+  assign pre_round_sign     = final_sign;
+
+  // Perform the rounding (one shared instance at super-format width)
+  fpnew_rounding #(
+    .AbsWidth     ( SUPER_DST_EXP_BITS + SUPER_DST_MAN_BITS )
+  ) i_fpnew_rounding (
+    .clk_i                      ( clk_i                    ),
+    .rst_ni                     ( rst_ni                   ),
+    .id_i                       ( '0                       ),
+    .abs_value_i                ( pre_round_abs            ),
+    .en_rsr_i                   ( 1'b0                     ),
+    .sign_i                     ( pre_round_sign           ),
+    .round_sticky_bits_i        ( round_sticky_bits        ),
+    .stochastic_rounding_bits_i ( '0                       ),
+    .rnd_mode_i                 ( rnd_mode_q               ),
+    .effective_subtraction_i    ( 1'b0 ), // Effective subtraction is not implemented as RNE is used - NOTE(review): rnd_mode_q is still forwarded, confirm
+    .abs_rounded_o              ( rounded_abs              ),
+    .sign_o                     ( rounded_sign             ),
+    .exact_zero_o               ( result_zero              )
+  );
+
+
+  for (genvar fmt = 0; fmt < int'(NUM_FORMATS); fmt++) begin : gen_sign_inject
+    // Set up some constants
+    localparam int unsigned FP_WIDTH = fpnew_pkg::fp_width(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned EXP_BITS = fpnew_pkg::exp_bits(fpnew_pkg::fp_format_e'(fmt));
+    localparam int unsigned MAN_BITS = fpnew_pkg::man_bits(fpnew_pkg::fp_format_e'(fmt));
+
+    if (FpDstFmtConfig[fmt]) begin : active_dst_format
+      always_comb begin : post_process
+        // detect of / uf from the exponent field of the rounded value
+        fmt_uf_after_round[fmt] = rounded_abs[EXP_BITS+MAN_BITS-1:MAN_BITS] == '0; // denormal
+        fmt_of_after_round[fmt] = rounded_abs[EXP_BITS+MAN_BITS-1:MAN_BITS] == '1; // inf exp.
+
+        // Assemble regular result, nan box short ones.
+        fmt_result[fmt]               = '1;
+        fmt_result[fmt][FP_WIDTH-1:0] = {rounded_sign, rounded_abs[EXP_BITS+MAN_BITS-1:0]};
+      end
+    end else begin : inactive_format
+      assign fmt_uf_after_round[fmt] = fpnew_pkg::DONT_CARE;
+      assign fmt_of_after_round[fmt] = fpnew_pkg::DONT_CARE;
+      assign fmt_result[fmt]         = '{default: fpnew_pkg::DONT_CARE};
+    end
+  end
+
+  // Classification after rounding select by destination format
+  assign uf_after_round = fmt_uf_after_round[dst_fmt_q2];
+  assign of_after_round = fmt_of_after_round[dst_fmt_q2];
+endmodule
diff --git a/src/mxdotp/fpnew_mxdotp_multi_pkg.sv b/src/mxdotp/fpnew_mxdotp_multi_pkg.sv
new file mode 100644
index 00000000..c3ce3404
--- /dev/null
+++ b/src/mxdotp/fpnew_mxdotp_multi_pkg.sv
@@ -0,0 +1,148 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+//
+// Copyright and related rights are licensed under the Solderpad Hardware
+// License, Version 0.51 (the "License"); you may not use this file except in
+// compliance with the License. You may obtain a copy of the License at
+// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+// or agreed to in writing, software, hardware and materials distributed under
+// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+//
+// SPDX-License-Identifier: SHL-0.51
+
+// Author: Gamze Islamoglu <gislamoglu@iis.ee.ethz.ch>
+
+package fpnew_mxdotp_multi_pkg;
+  // Configuration
+  // Format enable mask, one bit per format (leftmost bit first): | FP32 | FP64 | FP16 | FP8 | FP16ALT | FP8ALT | FP6 | FP6ALT | FP4
+
+  // Default format configuration (all MX formats enabled)
+  // These define the maximum-width types and serve as defaults when not overridden by module parameters.
+  localparam fpnew_pkg::fmt_logic_t   MxdotpSrcFpFmtConfig  = 9'b000101111; // FP8, FP8ALT, FP6, FP6ALT, FP4
+  localparam fpnew_pkg::ifmt_logic_t  MxdotpSrcIntFmtConfig = 4'b1000;      // INT8
+  localparam fpnew_pkg::fmt_logic_t   MxdotpDstFpFmtConfig  = 9'b100010000; // FP32, FP16ALT
+  localparam int unsigned             VectorSize            = 8;
+
+  // Widths and counts that follow from the configuration above - do not change
+  localparam int unsigned SRC_WIDTH    = fpnew_pkg::max_fp_width(MxdotpSrcFpFmtConfig);
+  localparam int unsigned DST_WIDTH    = fpnew_pkg::max_fp_width(MxdotpDstFpFmtConfig);
+  localparam int unsigned SCALE_WIDTH  = 8;
+  localparam int unsigned NUM_OPERANDS = 2*VectorSize+1; // Two input vectors + accumulator (scale handled separately)
+  localparam int unsigned NUM_FORMATS  = fpnew_pkg::NUM_FP_FORMATS;
+  // ----------
+  // Constants
+  // ----------
+  // Super-formats that can hold all configured source / destination formats, respectively
+  localparam fpnew_pkg::fp_encoding_t SUPER_FORMAT     = fpnew_pkg::super_format(MxdotpSrcFpFmtConfig);
+  localparam fpnew_pkg::fp_encoding_t SUPER_DST_FORMAT = fpnew_pkg::super_format(MxdotpDstFpFmtConfig);
+
+  localparam int unsigned SUPER_EXP_BITS     = SUPER_FORMAT.exp_bits;
+  localparam int unsigned SUPER_MAN_BITS     = SUPER_FORMAT.man_bits;
+  localparam int unsigned SUPER_DST_EXP_BITS = SUPER_DST_FORMAT.exp_bits;
+  localparam int unsigned SUPER_DST_MAN_BITS = SUPER_DST_FORMAT.man_bits;
+
+  // Super-format covering only the two FP6 variants
+  localparam fpnew_pkg::fp_encoding_t FP6_SUPER_FORMAT = fpnew_pkg::super_format(9'b000000110); // FP6 & FP6ALT
+  localparam int unsigned FP6_EXP_BITS  = FP6_SUPER_FORMAT.exp_bits;
+  localparam int unsigned FP6_MAN_BITS  = FP6_SUPER_FORMAT.man_bits;
+  localparam int unsigned FP6_PREC_BITS = FP6_MAN_BITS + 1;
+
+  // FP4 field widths, taken directly from fpnew_pkg
+  localparam int unsigned FP4_EXP_BITS  = fpnew_pkg::exp_bits(fpnew_pkg::FP4);
+  localparam int unsigned FP4_MAN_BITS  = fpnew_pkg::man_bits(fpnew_pkg::FP4);
+  localparam int unsigned FP4_PREC_BITS = FP4_MAN_BITS + 1;
+
+  // Source precision bits 'p': super-format mantissa bits plus the implicit bit
+  localparam int unsigned PRECISION_BITS = SUPER_MAN_BITS + 1;
+  // Destination precision bits 'p_dst': destination super-format mantissa bits plus the implicit bit
+  localparam int unsigned DST_PRECISION_BITS = SUPER_DST_MAN_BITS + 1;
+
+  // Algorithm constants (widths of the fixed-point sum-of-products datapath)
+  localparam int unsigned ANCHOR               = 34; // Fractional point position
+  localparam int unsigned INT_BITS             = 32;
+  localparam int unsigned VECTOR_BITS          = $clog2(VectorSize);
+  localparam int unsigned PROD_SHIFT_WIDTH     = 1 + INT_BITS + ANCHOR;
+  localparam int unsigned SOP_FIXED_WIDTH      = VECTOR_BITS + PROD_SHIFT_WIDTH;
+  localparam int unsigned FIXED_SUM_WIDTH      = 1 + DST_PRECISION_BITS + 1 + (SOP_FIXED_WIDTH - 1); // Layout: |s|-Acc:24b-|R|-unsigned SoP:64+log2k-|
+  localparam int unsigned LZC_SUM_WIDTH        = FIXED_SUM_WIDTH + DST_PRECISION_BITS;
+  localparam int unsigned LZC_RESULT_WIDTH     = $clog2(LZC_SUM_WIDTH);
+  localparam int signed   MAX_ACC_SHIFT_AMOUNT = FIXED_SUM_WIDTH - DST_PRECISION_BITS - 1; // Maximum allowable shift, -1 for the sign bit
+  localparam int unsigned SOP_SHIFT            = ANCHOR - 2*SUPER_MAN_BITS; // Constant left shift amount for the SOP to align the fractional point
+
+  // FP6 specific
+  localparam int unsigned FP6_PROD_WIDTH       = 2*FP6_PREC_BITS + 1; // 2p+1 for the product
+  localparam int unsigned FP6_PROD_SHIFT_WIDTH = 2*(2**FP6_EXP_BITS-1-fpnew_pkg::bias(fpnew_pkg::FP6)) + FP6_PROD_WIDTH + 4; // 2*(2^e-1-bias) + 2p+1 + 4, (2^e-1-bias): max shift amount; +4 is due to the minimum value of the sum of exponents for FP6 (-4)
+
+  // FP4 specific
+  localparam int unsigned FP4_PROD_WIDTH       = 2*FP4_PREC_BITS + 1; // 2p+1 for the product
+  localparam int unsigned FP4_PROD_SHIFT_WIDTH = 2*(2**FP4_EXP_BITS-1-fpnew_pkg::bias(fpnew_pkg::FP4)) + FP4_PROD_WIDTH; // 2*(2^e-1-bias) + 2p+1, (2^e-1-bias): max shift amount
+
+  // Internal exponent width of FMA must accommodate all meaningful exponent values in order to avoid
+  // datapath leakage. This is either given by the exponent bits or the width of the LZC result.
+  // In most reasonable FP formats the internal exponent will be wider than the LZC result.
+  localparam int unsigned EXP_WIDTH          = SUPER_EXP_BITS + 1;
+  localparam int unsigned DST_EXP_WIDTH      = SUPER_DST_EXP_BITS + 2; // +2 for overflow handling
+  // Shift amount width: $clog2(DST_BIAS - ANCHOR + (scale_a+scale_b) + FIXED_SUM_WIDTH - 1); DST_BIAS is the FP32 bias. NOTE(review): the scale term is coded as 2**SCALE_WIDTH - 1 - confirm this bounds scale_a+scale_b
+  localparam int unsigned SHIFT_AMOUNT_WIDTH = $clog2(fpnew_pkg::bias(fpnew_pkg::FP32) - ANCHOR + 2**(SCALE_WIDTH) - 1 + FIXED_SUM_WIDTH - 1);
+
+  // ----------------
+  // Type definitions
+  // ----------------
+  typedef struct packed { // Sign / exponent / mantissa with the source super-format widths
+    logic                      sign;
+    logic [SUPER_EXP_BITS-1:0] exponent;
+    logic [SUPER_MAN_BITS-1:0] mantissa;
+  } fp_src_t;
+  typedef struct packed { // Sign / exponent / mantissa with the FP6 super-format widths
+    logic                    sign;
+    logic [FP6_EXP_BITS-1:0] exponent;
+    logic [FP6_MAN_BITS-1:0] mantissa;
+  } fp6_src_t;
+  typedef struct packed { // Sign / exponent / mantissa with the FP4 widths
+    logic                    sign;
+    logic [FP4_EXP_BITS-1:0] exponent;
+    logic [FP4_MAN_BITS-1:0] mantissa;
+  } fp4_src_t;
+  typedef struct packed { // Sign / exponent / mantissa with the destination super-format widths
+    logic                          sign;
+    logic [SUPER_DST_EXP_BITS-1:0] exponent;
+    logic [SUPER_DST_MAN_BITS-1:0] mantissa;
+  } fp_dst_t;
+
+  // ----------
+  // Functions
+  // ----------
+
+  // Returns the MXDOTP destination format config derived from the global FpFmtConfig `cfg`: the FP32 and
+  // FP16ALT enable bits of `cfg` are passed through, the bits of all other formats are forced to 0.
+  function automatic fpnew_pkg::fmt_logic_t get_mxdotp_dst_fmts(fpnew_pkg::fmt_logic_t cfg);
+    automatic fpnew_pkg::fmt_logic_t res;
+    res = { cfg[fpnew_pkg::FP32],    // FP32    (taken from cfg)
+            1'b0,                    // FP64
+            1'b0,                    // FP16
+            1'b0,                    // FP8
+            cfg[fpnew_pkg::FP16ALT], // FP16ALT (taken from cfg)
+            1'b0,                    // FP8ALT
+            1'b0,                    // FP6
+            1'b0,                    // FP6ALT
+            1'b0                     // FP4
+    };
+    return res;
+  endfunction
+
+  function automatic int unsigned bias_constant(fpnew_pkg::fp_format_e fmt); // Exponent bias of `fmt` as a literal
+    unique case (fmt)
+      fpnew_pkg::FP32:    return 127; // 2^(8-1) - 1
+      fpnew_pkg::FP16:    return 15;  // 2^(5-1) - 1
+      fpnew_pkg::FP16ALT: return 127; // 2^(8-1) - 1
+      fpnew_pkg::FP8:     return 15;  // 2^(5-1) - 1
+      fpnew_pkg::FP8ALT:  return 7;   // 2^(4-1) - 1
+      fpnew_pkg::FP6:     return 3;   // 2^(3-1) - 1
+      fpnew_pkg::FP6ALT:  return 1;   // 2^(2-1) - 1
+      fpnew_pkg::FP4:     return 1;   // 2^(2-1) - 1
+      default:            return fpnew_pkg::bias(fmt); // Formats not listed above fall back to fpnew_pkg::bias
+    endcase
+  endfunction
+
+endpackage
diff --git a/src_files.yml b/src_files.yml
index 84348a98..8ba39f50 100644
--- a/src_files.yml
+++ b/src_files.yml
@@ -41,6 +41,10 @@ fpnew:
     src/fpnew_sdotp_multi.sv,
     src/fpnew_sdotp_multi_wrapper.sv,
     src/fpnew_noncomp.sv,
+    src/mxdotp/fpnew_mxdotp_multi_pkg.sv,
+    src/mxdotp/fpnew_mxdotp_multi_modules.sv,
+    src/fpnew_mxdotp_multi.sv,
+    src/fpnew_mxdotp_multi_wrapper.sv,
     src/fpnew_opgroup_block.sv,
     src/fpnew_opgroup_fmt_slice.sv,
     src/fpnew_opgroup_multifmt_slice.sv,