
Commit 4bb65f2

kaiyux, megha95, and Shixiaowei02 authored

Update TensorRT-LLM (NVIDIA#1274)

* Update TensorRT-LLM

---------

Co-authored-by: meghagarwal <[email protected]>
Co-authored-by: Shixiaowei02 <[email protected]>

1 parent 728cc00 commit 4bb65f2

File tree: 488 files changed, +23178 -10463 lines


.clang-format (+1)

@@ -59,6 +59,7 @@ PenaltyBreakString: 1000
 PenaltyExcessCharacter: 1000000
 PenaltyReturnTypeOnItsOwnLine: 60
 PointerAlignment: Left
+QualifierAlignment: Right
 ReflowComments: true
 SeparateDefinitionBlocks: Always
 SortIncludes: CaseSensitive
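
The added line switches clang-format from its default (leaving qualifiers where they are written) to right-aligned qualifiers, the "east const" style: `const` and `volatile` are placed to the right of the type they qualify. A minimal illustration of what the formatter now rewrites (sample code, not from the commit):

```
#include <string>

// Written with the qualifier on the left, clang-format with
// QualifierAlignment: Right rewrites this declaration ...
//     void greet(const std::string& name);
// ... into the east-const form:
void greet(std::string const& name);

// Both declare the same parameter type; only the token order changes.
```

The `catch (std::exception const& e)` hunk in benchmarks/cpp/bertBenchmark.cpp at the end of this commit is exactly this rule applied to existing code.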

.gitignore (+10)

@@ -17,6 +17,16 @@ venv/
 .local/
 .hypothesis/
 .idea/
+dump*/
+.trt-internal
+*.dot
+*.prof
+*.log
+*.pkl
+*.hdf5
+*.lock
+config.json
+/*.svg
 cpp/cmake-build-*
 cpp/.ccache/
 tensorrt_llm/libs

README.md (+3)

@@ -355,6 +355,9 @@ however, that it is recommended to use the C++ version.

 ## Troubleshooting

+* If you encounter accuracy issues in the generated text, you may want to increase
+  the internal precision in the attention layer. For that, pass `--context_fmha_fp32_acc enable` to
+  `trtllm-build`.

 * It's recommended to add options `--shm-size=1g --ulimit memlock=-1` to the
   docker or nvidia-docker run command. Otherwise you may see NCCL errors when
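
The new troubleshooting entry concerns the accumulator used inside the fused attention kernels: with FP16 models the attention dot products are, by default, accumulated at reduced precision, and `--context_fmha_fp32_acc enable` widens that accumulation to FP32. As a rough, standalone sketch of why accumulator width matters (not TensorRT-LLM code; `float` and `double` stand in for FP16 and FP32):

```
// Standalone sketch: summing many small terms, as attention does when
// reducing over a long context. The narrow accumulator loses the later
// contributions once its running value dwarfs each addend; the wide one
// does not.
#include <cstdio>

int main()
{
    int const n = 1 << 24;      // ~16.8M small contributions
    float const term = 1.0e-4f; // exact total: n * term ~= 1677.72

    float accNarrow = 0.0f;     // stand-in for an FP16 accumulator
    double accWide = 0.0;       // stand-in for an FP32 accumulator
    for (int i = 0; i < n; ++i)
    {
        accNarrow += term;
        accWide += term;
    }
    std::printf("narrow accumulator: %.4f\n", accNarrow); // drifts, then stalls near 2048
    std::printf("wide accumulator:   %.4f\n", accWide);   // stays close to 1677.72
    return 0;
}
```

FP16 carries only a 10-bit mantissa, so the same effect bites far earlier than `float` does here, which is what can surface as degraded generated text on long prompts.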

benchmarks/cpp/README.md (-5)

@@ -39,7 +39,6 @@ Take GPT-350M as an example for single GPU

 ```
 ./benchmarks/gptSessionBenchmark \
-    --model gpt_350m \
     --engine_dir "../../benchmarks/gpt_350m/" \
     --batch_size "1" \
     --input_output_len "60,20"
@@ -50,7 +49,6 @@ Take GPT-350M as an example for single GPU
 Take GPT-175B as an example for multiple GPUs
 ```
 mpirun -n 8 ./benchmarks/gptSessionBenchmark \
-    --model gpt_175b \
     --engine_dir "../../benchmarks/gpt_175b/" \
     --batch_size "1" \
     --input_output_len "60,20"
@@ -125,7 +123,6 @@ cd cpp/build
 Take GPT-350M as an example for single GPU V1 batching
 ```
 ./benchmarks/gptManagerBenchmark \
-    --model gpt \
     --engine_dir ../../examples/gpt/trt_engine/gpt2/fp16/1-gpu/ \
     --type V1 \
     --dataset ../../benchmarks/cpp/preprocessed_dataset.json
@@ -135,7 +132,6 @@ Take GPT-350M as an example for single GPU V1 batching
 Take GPT-350M as an example for 2-GPU inflight batching
 ```
 mpirun -n 2 ./benchmarks/gptManagerBenchmark \
-    --model gpt \
     --engine_dir ../../examples/gpt/trt_engine/gpt2-ib/fp16/2-gpu/ \
     --type IFB \
     --dataset ../../benchmarks/cpp/preprocessed_dataset.json
@@ -165,7 +161,6 @@ Given a `static_emulated_batch_size` of `n` the server will wait for `n` requests
 Take GPT-350M as an example for single GPU with static batching
 ```
 ./benchmarks/gptManagerBenchmark \
-    --model gpt \
     --engine_dir ../../examples/gpt/trt_engine/gpt2/fp16/1-gpu/ \
     --type IFB \
     --static_emulated_batch_size 32 \

benchmarks/cpp/bertBenchmark.cpp (+1 -1)

@@ -237,7 +237,7 @@ int main(int argc, char* argv[])
         benchmarkBert(result["model"].as<std::string>(), result["engine_dir"].as<std::string>(), batchSizes, inLens,
             logger, result["warm_up"].as<int>(), result["num_runs"].as<int>(), result["duration"].as<int>());
     }
-    catch (const std::exception& e)
+    catch (std::exception const& e)
     {
         TLLM_LOG_ERROR(e.what());
         return 1;
