@@ -789,16 +789,59 @@ static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format)
}

// this function applies automatic scaling to rope freq base when the desired context exceeds trained context
-static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_train, int n_ctx_desired, bool is_solar)
+static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_train, int n_ctx_desired, GGUFArch model_arch)
{
    if(n_ctx_desired <= n_ctx_train || n_ctx_desired <= 2048)
    {
        return original_rope_base;
    }
-    float ctx_multiplier = (is_solar?8.0f:1.0f);
-    float chi_ctx_train_value = (n_ctx_train * ctx_multiplier) / 6.28318;
-    float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
-    return powf(original_rope_base, logf(chi_ctx_value) / logf(chi_ctx_train_value));
+    else
+    {
+        float ctx_multiplier = (model_arch==GGUFArch::ARCH_SOLAR?8.0f:1.0f);
+        float chi_ctx_train_value = (n_ctx_train * ctx_multiplier) / 6.28318;
+        float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
+        float gradient_ai_rope_freq_base_value = powf(original_rope_base, log10f(chi_ctx_value) / log10f(chi_ctx_train_value));
+
+        if(debugmode==1)
+        {
+            printf("Trained max context length (value:%.d).\n", n_ctx_train);
+            printf("Desired context length (value:%.d).\n", n_ctx_desired);
+            printf("Solar context multiplier (value:%.3f).\n", ctx_multiplier);
+            printf("Chi context train (value:%.3f).\n", chi_ctx_train_value);
+            printf("Chi chosen context (value:%.3f).\n", chi_ctx_value);
+            printf("Log Chi context train (value:%.3f).\n", log10f(chi_ctx_train_value));
+            printf("Log Chi chosen context (value:%.3f).\n", log10f(chi_ctx_value));
+            printf("RoPE Frequency Base value (value:%.3f).\n", original_rope_base);
+            printf("RoPE base calculated via Gradient AI formula. (value:%.1f).\n", gradient_ai_rope_freq_base_value);
+        }
+
+        if(model_arch==GGUFArch::ARCH_SOLAR)
+        {
+            float extended_rope_positive_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / ((log10f(chi_ctx_value) * log10f(chi_ctx_train_value)) - (log10f(chi_ctx_value) + log10f(chi_ctx_train_value))));
+            float rope_freq_base_with_positive_offset = gradient_ai_rope_freq_base_value * extended_rope_positive_offset_value;
+            if(debugmode==1)
+            {
+                printf("Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n", extended_rope_positive_offset_value);
+                printf("RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n", rope_freq_base_with_positive_offset);
+            }
+            return rope_freq_base_with_positive_offset;
+        }
+        // else if(model_arch==GGUFArch::ARCH_MISTRAL_LLAMA_1_AND_2)
+        // {
+        //     float extended_rope_negative_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / (3.14159265358979323846 * 3.14159265358979323846));
+        //     float rope_freq_base_with_negative_offset = gradient_ai_rope_freq_base_value / extended_rope_negative_offset_value;
+        //     if(debugmode==1)
+        //     {
+        //         printf("Extended RoPE Negative Offset (divisor) for Llama 1 and 2 based models. (value:%.3f).\n", extended_rope_negative_offset_value);
+        //         printf("RoPE base calculated via Gradient AI formula for Llama 1 and 2 based models. (value:%.1f).\n", rope_freq_base_with_negative_offset);
+        //     }
+        //     return rope_freq_base_with_negative_offset;
+        // }
+        else
+        {
+            return gradient_ai_rope_freq_base_value;
+        }
+    }
}

ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format, FileFormatExtraMeta in_file_format_meta)
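For intuition, here is a minimal self-contained sketch of the formula the new function implements: the chi values are the (multiplier-scaled) context lengths divided by 2*pi (6.28318 in the patch), and the scaled base is the original base raised to the ratio of their logarithms. The helper name and the sample inputs (base 10000, trained context 4096, desired context 16384) are hypothetical, not taken from this patch.

#include <cmath>
#include <cstdio>

// Standalone re-implementation of the Gradient AI scaling above.
// ctx_multiplier is 8.0f on the ARCH_SOLAR path, 1.0f otherwise.
static float gradient_ai_rope_base(float base, int n_ctx_train, int n_ctx_desired, float ctx_multiplier)
{
    if(n_ctx_desired <= n_ctx_train || n_ctx_desired <= 2048)
    {
        return base; // no scaling needed at or below the trained context
    }
    float chi_train = (n_ctx_train * ctx_multiplier) / 6.28318f;
    float chi = (n_ctx_desired * ctx_multiplier) / 6.28318f;
    // log10f(a)/log10f(b) equals logf(a)/logf(b), so the patch's switch
    // from logf to log10f changes only the debug prints, not this value.
    return powf(base, log10f(chi) / log10f(chi_train));
}

int main()
{
    // hypothetical Llama-2-style model: prints roughly 71700
    printf("%.1f\n", gradient_ai_rope_base(10000.0f, 4096, 16384, 1.0f));
    // hypothetical Solar-style model (before the positive offset): roughly 44400
    printf("%.1f\n", gradient_ai_rope_base(10000.0f, 4096, 16384, 8.0f));
    return 0;
}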
@@ -850,10 +893,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
    else
    {
        // Set freq base for all, including non GGUF. If we are using GGUF, this will be overwritten with more accurate values later.
-        rope_freq_base = CalcGradientAIRopeFreqBase(10000.0f,2048,kcpp_params->n_ctx,false);
+        rope_freq_base = CalcGradientAIRopeFreqBase(10000.0f,2048,kcpp_params->n_ctx,GGUFArch::ARCH_DEFAULT);
        if(file_format==FileFormat::GGUF_GENERIC)
        {
-            printf("Using automatic RoPE scaling. If the model has customized RoPE settings, they will be used directly instead!\n");
+            printf("Using automatic RoPE scaling for GGUF. If the model has custom RoPE settings, they'll be used directly instead!\n");
+            printf("This means the RoPE values printed above will be replaced by the values reported once the model is loaded.\n");
        }
        else
        {
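As a concrete (hypothetical) instance of this non-GGUF default path: with the hard-coded base 10000 and assumed trained context 2048, requesting an 8192 context would scale the base roughly as follows. The numbers are illustrative only.

#include <cmath>
#include <cstdio>

int main()
{
    float lt = log10f(2048 / 6.28318f); // ~2.513
    float l  = log10f(8192 / 6.28318f); // ~3.115
    printf("%.1f\n", powf(10000.0f, l / lt)); // ~90800 replaces the 10000 default
    return 0;
}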
@@ -1099,7 +1143,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
    else
    {
        // Calculate rope_freq_base using the gradientAI formula, solar requires ctx *8 for correct scaling
-        rope_freq_base = CalcGradientAIRopeFreqBase(llamamodel->hparams.rope_freq_base_train, file_format_meta.n_ctx_train, kcpp_params->n_ctx, file_format_meta.model_architecture==GGUFArch::ARCH_SOLAR);
+        rope_freq_base = CalcGradientAIRopeFreqBase(llamamodel->hparams.rope_freq_base_train, file_format_meta.n_ctx_train, kcpp_params->n_ctx, file_format_meta.model_architecture);
        llama_ctx_params.rope_freq_base = rope_freq_base;
        llama_ctx_params.rope_freq_scale = rope_freq_scale;
        printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);