pyth-network · May 17, 2024
diff --git a/‎program/c/makefile
+1 b/‎program/c/makefile
+1
diff --git a/‎program/c/src/oracle/model/price_model.c
+81-73 b/‎program/c/src/oracle/model/price_model.c
+81-73
diff --git a/‎program/c/src/oracle/model/price_model.h
+82-11 b/‎program/c/src/oracle/model/price_model.h
+82-11
diff --git a/‎program/c/src/oracle/model/test_price_model.c
+3-29 b/‎program/c/src/oracle/model/test_price_model.c
+3-29
diff --git a/‎program/c/src/oracle/sort/sort_stable_base_gen.c
+94 b/‎program/c/src/oracle/sort/sort_stable_base_gen.c
+94
diff --git a/‎program/c/src/oracle/sort/test_sort_stable.c
+73 b/‎program/c/src/oracle/sort/test_sort_stable.c
+73
diff --git a/‎program/c/src/oracle/sort/tmpl/sort_stable.c
+195 b/‎program/c/src/oracle/sort/tmpl/sort_stable.c
+195
diff --git a/‎program/c/src/oracle/sort/tmpl/sort_stable_base.c
+21 b/‎program/c/src/oracle/sort/tmpl/sort_stable_base.c
+21
diff --git a/‎program/c/src/oracle/upd_aggregate.h
+2-1 b/‎program/c/src/oracle/upd_aggregate.h
+2-1
diff --git a/‎program/rust/src/tests/mod.rs
-1 b/‎program/rust/src/tests/mod.rs
-1
diff --git a/‎program/rust/src/tests/pyth_simulator.rs
+1-2 b/‎program/rust/src/tests/pyth_simulator.rs
+1-2
diff --git a/‎program/rust/src/tests/test_benchmark.rs
-122 b/‎program/rust/src/tests/test_benchmark.rs
-122
diff --git a/‎program/rust/src/tests/test_c_code.rs
+8 b/‎program/rust/src/tests/test_c_code.rs
+8
@@ -41,6 +41,7 @@ cpyth-native: features.h
 test: features.h
 	mkdir -p $(OUT_DIR)/test/
 	gcc -c ./src/oracle/model/test_price_model.c -o $(OUT_DIR)/test/test_price_model.o -fPIC
+	gcc -c ./src/oracle/sort/test_sort_stable.c -o $(OUT_DIR)/test/test_sort_stable.o -fPIC
 	gcc -c ./src/oracle/util/test_align.c -o $(OUT_DIR)/test/test_align.o -fPIC
 	gcc -c ./src/oracle/util/test_avg.c -o $(OUT_DIR)/test/test_avg.o -fPIC
 	gcc -c ./src/oracle/util/test_hash.c -o $(OUT_DIR)/test/test_hash.o -fPIC
 
@@ -1,104 +1,112 @@
 #include "price_model.h"
 #include "../util/avg.h" /* For avg_2_int64 */
 
-/*
- * In-place bottom-up Heapsort implementation optimized for minimal compute unit usage in BPF.
- *
- * Initially it creates a max heap in linear time and then to get ascending
- * order it swaps the root with the last element and then sifts down the root.
- *
- * The number of comparisions in average case is nlgn + O(1) and in worst case is
- * 1.5nlgn + O(n).
- *
- * There are a lot of (j-1) or (j+1) math in the code which can be optimized by
- * thinking of a as 1-based array. Fortunately, BPF compiler optimizes that for us.
- */
-void heapsort(int64_t * a, uint64_t n) {
-  if (n <= 1) return;
-
-  /*
-   * This is a bottom-up heapify which is linear in time.
-   */
-  for (uint64_t i = n / 2 - 1;; --i) {
-    int64_t root = a[i];
-    uint64_t j = i * 2 + 1;
-    while (j < n) {
-      if (j + 1 < n && a[j] < a[j + 1]) ++j;
-      if (root >= a[j]) break;
-      a[(j - 1) / 2] = a[j];
-      j = j * 2 + 1;
-    }
-    a[(j - 1) / 2] = root;
-
-    if (i == 0) break;
-  }
-
-  for (uint64_t i = n - 1; i > 0; --i) {
-    int64_t tmp = a[0];
-    a[0] = a[i];
-    a[i] = tmp;
-
-    int64_t root = a[0];
-    uint64_t j = 1;
-    while (j < i) {
-      if (j + 1 < i && a[j] < a[j + 1]) ++j;
-      if (root >= a[j]) break;
-      a[(j - 1) / 2] = a[j];
-      j = j * 2 + 1;
-    }
-    a[(j - 1) / 2] = root;
-  }
-}
+#define SORT_NAME  int64_sort_ascending
+#define SORT_KEY_T int64_t
+#include "../sort/tmpl/sort_stable.c"
 
-/*
- * Find the 25, 50, and 75 percentiles of the given quotes using heapsort.
- *
- * This implementation optimizes the price_model_core function for minimal compute unit usage in BPF.
- *
- * In Solana, each BPF instruction costs 1 unit of compute and is much different than a native code
- * execution time. Here are some of the differences:
- * 1. There is no cache, so memory access is much more expensive.
- * 2. The instruction set is very minimal, and there are only 10 registers available.
- * 3. The BPF compiler is not very good at optimizing the code.
- * 4. The stack size is limited and having extra stack frame has high overhead.
- *
- * This implementation is chosen among other implementations such as merge-sort, quick-sort, and quick-select
- * because it is very fast, has small number of instructions, and has a very small memory footprint by being
- * in-place and is non-recursive and has a nlogn worst-case time complexity.
- */
 int64_t *
 price_model_core( uint64_t  cnt,
                   int64_t * quote,
                   int64_t * _p25,
                   int64_t * _p50,
-                  int64_t * _p75) {
-  heapsort(quote, cnt);
+                  int64_t * _p75,
+                  void    * scratch ) {
+
+  /* Sort the quotes.  The sorting implementation used here is a highly
+     optimized mergesort (merge with an unrolled insertion sorting
+     network for small n base cases).  The best case is ~0.5 n lg n compares
+     and the average and worst cases are ~n lg n compares.
+
+     While not completely data oblivious, this has quite low variance in
+     operation count practically and this is _better_ than quicksort's
+     average case and quicksort's worst case is a computational
+     denial-of-service and timing attack vulnerable O(n^2).  Unlike
+     quicksort, this is also stable (but this stability does not
+     currently matter ... it might be a factor in future models).
+
+     A data oblivious sorting network approach might be viable here
+     and would have a completely deterministic operations count.  It
+     currently isn't used as the best known practical approaches for
+     general n have a worse algorithmic cost (O( n (lg n)^2 )) and,
+     while the application probably doesn't need perfect obliviousness,
+     mergesort is still moderately oblivious and the application can
+     benefit from mergesort's lower operations cost.  (The main drawback
+     of mergesort over quicksort is that it isn't in place, but memory
+     footprint isn't an issue here.)
+
+     Given the operations cost model (e.g. cache friendliness is not
+     incorporated), a radix sort might be viable here (O(n) in best /
+     average / worst).  It currently isn't used as we expect invocations
+     with small-ish n to be common and radix sort would have large
+     coefficients on the O(n) and additional fixed overheads that would
+     make it more expensive than mergesort in this regime.
+
+     Note: price_model_cnt_valid( cnt ) implies
+     int64_sort_ascending_cnt_valid( cnt ) currently.
+
+     Note: consider filtering out "NaN" quotes (i.e. INT64_MIN)? */
+
+  int64_t * sort_quote = int64_sort_ascending_stable( quote, cnt, scratch );
+
+  /* Extract the p25
+
+     There are many variants with subtle tradeoffs here.  One option is
+     to interpolate when the ideal p25 is bracketed by two samples (akin
+     to the p50 interpolation below when the number of quotes is even).
+     That is, for p25, interpolate between quotes floor((cnt-2)/4) and
+     ceil((cnt-2)/4) with the weights determined by cnt mod 4.  The
+     current preference is to not do that as it is slightly more
+     complex, doesn't exactly always minimize the current loss function
+     and is more exposed to the confidence intervals getting skewed by
+     bum quotes when the number of quotes is small.
+
+     Another option is to use the inside quote of the above pair.  That
+     is, for p25, use quote ceil((cnt-2)/4) == floor((cnt+1)/4) ==
+     (cnt+1)>>2.  The current preference is not to do this as, though
+     this has stronger bum quote robustness, it results in p25==p50==p75
+     when cnt==3.  (In this case, the above wants to do an interpolation
+     between quotes 0 and 1 for the p25 and between quotes 1 and 2
+     for the p75.  But limiting to just the inside quote results in
+     p25/p50/p75 all using the median quote.)
+
+     A tweak to this option, for p25, is to use floor(cnt/4) == cnt>>2.
+     This is simple, has the same asymptotic behavior for large cnt, has
+     good behavior in the cnt==3 case and practically as good bum quote
+     rejection in the moderate cnt case. */
 
-  /* Extract the p25 */
   uint64_t p25_idx = cnt >> 2;
-  *_p25 = quote[p25_idx];
+
+  *_p25 = sort_quote[p25_idx];
 
   /* Extract the p50 */
+
   if( (cnt & (uint64_t)1) ) { /* Odd number of quotes */
+
     uint64_t p50_idx = cnt >> 1; /* ==ceil((cnt-1)/2) */
-    *_p50 = quote[p50_idx];
+
+    *_p50 = sort_quote[p50_idx];
+
   } else { /* Even number of quotes (at least 2) */
+
     uint64_t p50_idx_right = cnt >> 1;                    /* == ceil((cnt-1)/2)> 0 */
     uint64_t p50_idx_left  = p50_idx_right - (uint64_t)1; /* ==floor((cnt-1)/2)>=0 (no overflow/underflow) */
 
-    int64_t vl = quote[p50_idx_left];
-    int64_t vr = quote[p50_idx_right];
+    int64_t vl = sort_quote[p50_idx_left ];
+    int64_t vr = sort_quote[p50_idx_right];
 
     /* Compute the average of vl and vr (with floor / round toward
        negative infinity rounding and without possibility of
        intermediate overflow). */
+
     *_p50 = avg_2_int64( vl, vr );
   }
 
   /* Extract the p75 (this is the mirror image of the p25 case) */
 
   uint64_t p75_idx = cnt - ((uint64_t)1) - p25_idx;
-  *_p75 = quote[p75_idx];
 
-  return quote;
+  *_p75 = sort_quote[p75_idx];
+
+  return sort_quote;
 }
@@ -8,20 +8,91 @@
 extern "C" {
 #endif
 
-/*
- * This function computes the p25, p50 and p75 percentiles of the given quotes and
- * writes them to the given pointers. It also returns the sorted quotes array. Being
- * sorted is not necessary for this model to work, and is only relied upon by the
- * tests to verify the correctness of the model with more confidence.
- *
- * The quote array might get modified by this function.
- */
-int64_t *
-price_model_core( uint64_t  cnt,       /* Number of elements in quote */
+/* Returns the minimum and maximum number of quotes the implementation
+   can handle */
+
+static inline uint64_t
+price_model_quote_min( void ) {
+  return (uint64_t)1;
+}
+
+static inline uint64_t
+price_model_quote_max( void ) {
+  return (UINT64_MAX-(uint64_t)alignof(int64_t)+(uint64_t)1) / (uint64_t)sizeof(int64_t);
+}
+
+/* price_model_cnt_valid returns non-zero if cnt is a valid value or
+   zero if not. */
+
+static inline int
+price_model_cnt_valid( uint64_t cnt ) {
+  return price_model_quote_min()<=cnt && cnt<=price_model_quote_max();
+}
+
+/* price_model_scratch_footprint returns the number of bytes of scratch
+   space needed for an arbitrarily aligned scratch region required by
+   price_model to handle price_model_quote_min() to cnt quotes
+   inclusive. */
+
+static inline uint64_t
+price_model_scratch_footprint( uint64_t cnt ) { /* Assumes price_model_cnt_valid( cnt ) is true */
+  /* cnt int64_t's plus worst case alignment padding, no overflow
+     possible as cnt is valid at this point */
+  return cnt*(uint64_t)sizeof(int64_t)+(uint64_t)alignof(int64_t)-(uint64_t)1;
+}
+
+/* price_model_core minimizes (to quote precision in a floor / round
+   toward negative infinity sense) the loss model of the given quotes.
+   Assumes valid inputs (e.g. cnt is at least 1 and not unreasonably
+   large ... typically a multiple of 3 but this is not required,
+   quote[i] for i in [0,cnt) are the quotes of interest on input, p25,
+   p50, p75 point to where to write model outputs, scratch points to a
+   suitable footprint scratch region).
+
+   Returns a pointer to the quotes sorted in ascending order.  As such,
+   the min and max and any other rank statistic can be extracted easily
+   on return.  This location will either be quote itself or to a
+   location in scratch.  Use price_model below for a variant that always
+   replaces quote with the sorted quotes (potentially has extra ops for
+   copying).  Further, on return, *_p25, *_p50, *_p75 will hold the loss
+   model minimizing values for the input quotes and the scratch region
+   was clobbered.
+
+   Scratch points to a memory region of arbitrary alignment with at
+   least price_model_scratch_footprint( cnt ) bytes and it will be
+   clobbered on output.  It is sufficient to use a normally aligned /
+   normally allocated / normally declared array of cnt int64_t's.
+
+   The cost of this function is a fast and low variance (but not
+   completely data oblivious) O(cnt lg cnt) in the best / average /
+   worst cases.  This function uses no heap / dynamic memory allocation.
+   It is thread safe provided it is passed non-conflicting quote, output
+   and scratch arrays.  It has a bounded call depth ~lg cnt <= ~64 (this
+   could reduce to O(1) by using a non-recursive sort/select
+   implementation under the hood if desired). */
+
+int64_t *                              /* Returns pointer to sorted quotes (either quote or ALIGN_UP(scratch,int64_t)) */
+price_model_core( uint64_t  cnt,       /* Assumes price_model_cnt_valid( cnt ) is true */
                   int64_t * quote,     /* Assumes quote[i] for i in [0,cnt) is the i-th quote on input */
                   int64_t * _p25,      /* Assumes *_p25 is safe to write to the p25 model output */
                   int64_t * _p50,      /* Assumes *_p50 " */
-                  int64_t * _p75);     /* Assumes *_p75 " */
+                  int64_t * _p75,      /* Assumes *_p75 " */
+                  void    * scratch ); /* Assumes a suitable scratch region */
+
+/* Same as the above but always returns quote and quote always holds the
+   sorted quotes on return. */
+
+static inline int64_t *
+price_model( uint64_t  cnt,
+             int64_t * quote,
+             int64_t * _p25,
+             int64_t * _p50,
+             int64_t * _p75,
+             void    * scratch ) {
+  int64_t * tmp = price_model_core( cnt, quote, _p25, _p50, _p75, scratch );
+  if( tmp!=quote ) for( uint64_t idx=(uint64_t)0; idx<cnt; idx++ ) quote[ idx ] = tmp[ idx ];
+  return quote;
+}
 
 #ifdef __cplusplus
 }
 
@@ -19,38 +19,13 @@ int test_price_model() {
   prng_t _prng[1];
   prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) );
 
-# define N 192
+# define N 96
 
   int64_t quote0 [N];
   int64_t quote  [N];
   int64_t val    [3];
+  int64_t scratch[N];
 
-  /* Brute force validate small sizes via the 0-1 principle. */
-  for( int cnt=0; cnt<=24; cnt++ ) {
-    for( long mask=0L; mask<(1L<<cnt); mask++ ) {
-      for( int i=0; i<cnt; i++ ) quote0[i] = (int64_t) ((mask>>i) & 1L);
-
-      memcpy( quote, quote0, sizeof(int64_t)*(size_t)cnt );
-      if( price_model_core( cnt, quote, val+0, val+1, val+2)!=quote ) { printf( "FAIL (01-compose)\n" ); return 1; }
-
-      /* Validate the results */
-
-      /* Although being sorted is not necessary it gives us more confidence about the correctness of the model */
-      qsort( quote0, (size_t)cnt, sizeof(int64_t), qcmp );
-      if( memcmp( quote, quote0, sizeof(int64_t)*(size_t)cnt ) ) { printf( "FAIL (01-sort)\n" ); return 1; }
-
-      uint64_t p25_idx = cnt>>2;
-      uint64_t p50_idx = cnt>>1;
-      uint64_t p75_idx = cnt - (uint64_t)1 - p25_idx;
-      uint64_t is_even = (uint64_t)!(cnt & (uint64_t)1);
-
-      if( val[0]!=quote[ p25_idx ] ) { printf( "FAIL (01-p25)\n" ); return 1; }
-      if( val[1]!=avg_2_int64( quote[ p50_idx-is_even ], quote[ p50_idx ] ) ) { printf( "FAIL (01-p50)\n" ); return 1; }
-      if( val[2]!=quote[ p75_idx ] ) { printf( "FAIL (01-p75)\n" ); return 1; }
-    }
-  }
-
-  /* Test using randomized inputs */
   for( int iter=0; iter<10000000; iter++ ) {
 
     /* Generate a random test */
@@ -61,11 +36,10 @@ int test_price_model() {
     /* Apply the model */
 
     memcpy( quote, quote0, sizeof(int64_t)*(size_t)cnt );
-    if( price_model_core( cnt, quote, val+0, val+1, val+2)!=quote ) { printf( "FAIL (compose)\n" ); return 1; }
+    if( price_model( cnt, quote, val+0, val+1, val+2, scratch )!=quote ) { printf( "FAIL (compose)\n" ); return 1; }
 
     /* Validate the results */
 
-    /* Although being sorted is not necessary it gives us more confidence about the correctness of the model */
     qsort( quote0, (size_t)cnt, sizeof(int64_t), qcmp );
     if( memcmp( quote, quote0, sizeof(int64_t)*(size_t)cnt ) ) { printf( "FAIL (sort)\n" ); return 1; }
 
 
@@ -0,0 +1,94 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+void
+sort_gen( int n ) {
+
+# if 0
+
+  /* In register variant (PSEUDO OPS ~ 9+4n+3n(n-1))
+     Assumes switch ~ 3 PSEUDO OPS (LDA,LD,JMP) -> 3 switch statements / 9 pseudo ops
+     Assumes load   ~ 2 PSEUDO OPS (LDA,LD)     -> n loads             / 2n pseudo ops
+     Assumes store  ~ "
+     Assumes cswap  ~ 6 PSEUDO OPS (CMP,MOV,TESTEQ,CMOV,TESTNEQ,CMOV)  / 6*0.5*n*(n-1) pseudo ops */
+
+//printf( "static inline key_t *              /* returns (sorted) x */\n" );
+//printf( "sort_network_stable( key_t * x,    /* indexed [0,n) */\n" );
+//printf( "                     ulong   n ) { /* assumes n in [0,%i) */\n", n );
+  printf( "  int c;\n" );
+  printf( "  key_t t" );
+  for( int i=0; i<n; i++ ) printf( ", x%i", i );
+  printf( ";\n" );
+  printf( "\n" );
+  printf( "  switch( n ) {\n" );
+  for( int i=n; i; i-- ) printf( "  case %2iUL: x%i = x[%i]; /* fall through */\n", i, i-1, i-1 );
+  printf( "  default: break;\n" );
+  printf( "  }\n" );
+  printf( "\n" );
+  printf( "# define _(i,j) c = BEFORE( x##j, x##i ); t = x##i; x##i = c ? x##j : x##i; x##j = c ? t : x##j\n" );
+  printf( "  switch( n ) {\n" );
+  for( int i=n-1; i; i-- ) {
+    printf( "  case %2iUL:", i+1 );
+    for( int j=0; j<i; j++ ) printf( " _(%2i,%2i);", j,j+1 );
+    printf( " /* fall through */\n" );
+  }
+  printf( "  default: break;\n" );
+  printf( "  }\n" );
+  printf( "# undef _\n" );
+  printf( "\n" );
+  printf( "  switch( n ) {\n" );
+  for( int i=n; i; i-- ) printf( "  case %2iUL: x[%i] = x%i; /* fall through */\n", i, i-1, i-1 );
+  printf( "  default: break;\n" );
+  printf( "  }\n" );
+
+//printf( "\n" );
+//printf( "  return x;\n" );
+//printf( "}\n" );
+
+# else
+
+  /* Memory variant (PSEUDO OPS ~ 3+4.5n(n-1))
+     Assumes switch ~ 3 PSEUDO OPS (LDA,LD,JMP) -> 3 pseudo ops
+     Assumes cswap  ~ 9 PSEUDO OPS (LDA,LDA,LD,LD,CMP,CMOV,CMOV,ST,ST) / 9*0.5*n*(n-1) pseudo ops */
+
+//printf( "static inline key_t *              /* returns (sorted) x */\n" );
+//printf( "sort_network_stable( key_t * x,    /* indexed [0,n) */\n" );
+//printf( "                     ulong   n ) { /* assumes n in [0,%i) */\n", n );
+
+  printf( "  do { /* BEGIN AUTOGENERATED CODE (n=%2i) *****************************/\n", n );
+  printf( "    /* This network of comparators and fallthroughs implement a sorting network representation\n" );
+  printf( "       of an insertion sort. Each case acts as a sort pass with the fallthrough falling through\n" );
+  printf( "       to smaller ranges of the input. */\n");
+  printf( "#   define SORT_STABLE_CE(i,j) u = x[(SORT_IDX_T)i]; v = x[(SORT_IDX_T)j]; c = SORT_BEFORE( v, u ); x[(SORT_IDX_T)i] = c ? v : u; x[(SORT_IDX_T)j] = c ? u : v\n" );
+  printf( "    int        c;\n" );
+  printf( "    SORT_KEY_T u;\n" );
+  printf( "    SORT_KEY_T v;\n" );
+  printf( "    switch( n ) {\n" );
+  for( int i=n-1; i>=0; i-- ) {
+    printf( "    case (SORT_IDX_T)%2i:", i+1 );
+    for( int j=0; j<i; j++ ) printf( " SORT_STABLE_CE(%2i,%2i);", j,j+1 );
+    printf( " /* fall through */\n" );
+  }
+  printf( "    case (SORT_IDX_T) 0: return x;\n" );
+  printf( "    default: break;\n" );
+  printf( "    }\n" );
+  printf( "#   undef SORT_STABLE_CE\n" );
+  printf( "  } while(0); /* END AUTOGENERATED CODE *******************************/\n" );
+
+//printf( "\n" );
+//printf( "  return x;\n" );
+//printf( "}\n" );
+
+# endif
+
+}
+
+int
+main( int     argc,
+      char ** argv ) {
+  if( argc!=2 ) { fprintf( stderr, "Usage: %s [max_base_case]\n", argv[0] ); return 1; }
+  int n = atoi( argv[1] );
+  if( n<1 ) { fprintf( stderr, "n (%i) must be positive\n", n ); return 1; }
+  sort_gen( n );
+  return 0;
+}
@@ -0,0 +1,73 @@
+#include <stdio.h>
+#include "../util/util.h"
+
+#define BEFORE(i,j) (((i)>>16)<((j)>>16))
+
+#define SORT_NAME        sort
+#define SORT_KEY_T       int
+#define SORT_IDX_T       int
+#define SORT_BEFORE(i,j) BEFORE(i,j)
+#include "tmpl/sort_stable.c"
+
+int test_sort_stable() {
+
+# define N 96
+  int x[N];
+  int y[N];
+  int w[N];
+
+  /* Brute force validate small sizes via the 0-1 principle (with
+     additional information in the keys to validate stability as well). */
+
+  for( int n=0; n<=24; n++ ) {
+    for( long b=0L; b<(1L<<n); b++ ) {
+      for( int i=0; i<n; i++ ) x[i] = (((int)((b>>i) & 1L))<<16) | i;
+      for( int i=0; i<n; i++ ) w[i] = x[i];
+
+      int * z = sort_stable( x,n, y );
+
+      /* Make sure that z is a permutation of input data */
+      for( int i=0; i<n; i++ ) {
+        int j = z[i] & (int)0xffff; /* j is the index where z[i] was initially */
+        if( j<0 || j>=n || z[i]!=w[j] ) { printf( "FAIL (corrupt)\n" ); return 1; }
+        w[j] = -1; /* Mark that this entry has already been confirmed */
+      }
+      for( int i=0; i<n; i++ ) if( w[i]!=-1 ) { printf( "FAIL (perm)\n" ); return 1; }
+
+      /* Make sure that z is in order and stable */
+      for( int i=1; i<n; i++ )
+        if( z[i]<=z[i-1] ) { printf( "FAIL (%s, b=%lx)\n", BEFORE( z[i], z[i-1] ) ? "order" : "stable", b ); return 1; }
+    }
+  }
+
+  /* Randomized validation for larger sizes */
+
+  prng_t _prng[1];
+  prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) );
+
+  for( int iter=0; iter<10000000; iter++ ) {
+
+    int n = (int)(prng_uint32( prng ) % (uint32_t)(N+1)); /* In [0,N], approx uniform IID */
+    for( int i=0; i<n; i++ ) x[i] = (int)((prng_uint32( prng ) & UINT32_C( 0x00ff0000 )) | (uint32_t)i);
+    for( int i=0; i<n; i++ ) w[i] = x[i];
+
+    int * z = sort_stable( x,n, y );
+
+    /* Make sure that z is a permutation of input data */
+    for( int i=0; i<n; i++ ) {
+      int j = z[i] & (int)0xffff; /* j is the index where z[i] was initially */
+      if( j<0 || j>=n || z[i]!=w[j] ) { printf( "FAIL (corrupt)\n" ); return 1; }
+      w[j] = -1; /* Mark that this entry has already been confirmed */
+    }
+    for( int i=0; i<n; i++ ) if( w[i]!=-1 ) { printf( "FAIL (perm)\n" ); return 1; }
+
+    /* Make sure that z is in order and stable */
+    for( int i=1; i<n; i++ )
+      if( z[i]<=z[i-1] ) { printf( "FAIL (%s)\n", BEFORE( z[i], z[i-1] ) ? "order" : "stable" ); return 1; }
+  }
+
+  prng_delete( prng_leave( prng ) );
+  return 0;
+}
+
+#undef BEFORE
@@ -0,0 +1,195 @@
+/* Usage:
+
+     #define SORT_NAME  mysort
+     #define SORT_KEY_T mykey_t
+     #include "sort_stable.c"
+
+   This will instantiate the following APIs:
+
+      // Returns non-zero if n is a supported sort size and zero if not.
+      // Unsupported values are negative n (only applicable for signed
+      // indexing types) or unreasonably large n (such that the scratch
+      // space requirement would be bigger than UINT64_MAX).
+
+      static inline int
+      mysort_stable_cnt_valid( uint64_t cnt );
+
+      // Return the footprint required for a scratch space of any
+      // alignment sufficient for sorting up to n items inclusive.
+      // Returns 0 if cnt is not valid or no scratch is needed.
+
+      static inline uint64_t
+      mysort_stable_scratch_footprint( uint64_t cnt );
+
+      // Sort elements of keys into an ascending order.  Algorithm has a
+      // best case of ~0.5 cnt lg cnt and an average and worst case of
+      // cnt lg cnt such that it is moderately resistant to timing and
+      // computational DOS attacks.  Further, the sort is stable.  The
+      // values in scratch are irrelevant on input.  Returns where the
+      // sorted data ended up (either key or ALIGN_UP(scratch,mykey_t)).
+      // That is, if this returns key, the values in key are the stably
+      // sorted data and scratch was clobbered.  Otherwise, the values
+      // at ALIGN_UP(scratch,mykey_t) are the stably sorted data and key
+      // was clobbered.  Users wanting the data in a particular location
+      // can copy as necessary (allowing this flexibility minimizes the
+      // amount of copying needed to do the sorting).  E.g.:
+      //
+      //   mykey_t * tmp = mysort_stable( key, cnt, scratch );
+      //   if( tmp!=key ) memcpy( key, tmp, cnt*sizeof(mykey_t) );
+      //
+      // scratch points to a scratch memory region of any alignment with
+      // room for mysort_stable_scratch_footprint( cnt ) bytes.  (Any
+      // normally declared / normally allocated region with mykey_t
+      // compatible alignment and space for cnt mykey_t's will work
+      // too.)
+      //
+      // FIXME: CONSIDER RETURNING NULL IF BAD INPUT ARGS
+
+      static mykey_t *
+      mysort_stable( mykey_t * key,       // Indexed [0,n)
+                     uint64_t  cnt,       // Assumes mysort_stable_cnt_valid( cnt ) is true
+                     void *    scratch ); // Pointer to suitable scratch region
+
+   This can be included multiple times with different names / parameters
+   to define many families of sorts that might be useful for a compilation
+   unit.
+
+   Other defines exist to change the sort criteria / direction, linkage
+   and so forth.  See below for details. */
+
+#include "../../util/compat_stdint.h" /* For uint64_t */
+#include "../../util/align.h"         /* For ALIGN_UP */
+
+#ifndef SORT_NAME
+#error "Define SORT_NAME"
+#endif
+
+#ifndef SORT_KEY_T
+#error "Define SORT_KEY_T; nominally a POD (plain-old-data) type"
+#endif
+
+/* Define SORT_IDX_T to specify the data type used to index key arrays.
+   Default is uint64_t. */
+
+#ifndef SORT_IDX_T
+#define SORT_IDX_T uint64_t
+#endif
+
+/* Define SORT_BEFORE to specify how sorted keys should be ordered.
+   Default is ascending as defined by the "<" operator for the type.
+   SORT_BEFORE(u,v) should be non-zero if key u should go strictly
+   before key v and zero otherwise. */
+
+#ifndef SORT_BEFORE
+#define SORT_BEFORE(u,v) ((u)<(v))
+#endif
+
+/* Define SORT_STATIC to specify the type of linkage the non-inlined
+   APIs should have (e.g. if defined to nothing, these will have
+   external linkage).  Default is static linkage. */
+
+#ifndef SORT_STATIC
+#define SORT_STATIC static
+#endif
+
+/* Define SORT_STATIC_INLINE to specify the type of linkage inlined
+   APIs should have (e.g. if defined to nothing, these will have
+   non-inlined external linkage).  Default is static inline linkage. */
+
+#ifndef SORT_STATIC_INLINE
+#define SORT_STATIC_INLINE static inline
+#endif
+
+/* Some macro preprocessor helpers */
+
+#define SORT_C3(a,b,c)a##b##c
+#define SORT_XC3(a,b,c)SORT_C3(a,b,c)
+#define SORT_IMPL(impl)SORT_XC3(SORT_NAME,_,impl)
+
+SORT_STATIC_INLINE int
+SORT_IMPL(stable_cnt_valid)( SORT_IDX_T cnt ) {
+  /* Written this way for compiler warning free signed SORT_IDX_T and/or
+     byte size SORT_KEY_T support (e.g. compiler often will warn to the
+     effect "n>=0 always true" if idx is an unsigned type or
+     "n<=UINT64_MAX always true" if key is a byte type). */
+  static uint64_t const max = ((UINT64_MAX - (uint64_t)alignof(SORT_KEY_T) + (uint64_t)1) / (uint64_t)sizeof(SORT_KEY_T));
+  return !cnt || (((SORT_IDX_T)0)<cnt && ((uint64_t)cnt)<max) || ((uint64_t)cnt)==max;
+}
+
+SORT_STATIC_INLINE uint64_t
+SORT_IMPL(stable_scratch_footprint)( SORT_IDX_T cnt ) {
+  if( !SORT_IMPL(stable_cnt_valid)( cnt ) ) return (uint64_t)0;
+  /* Guaranteed not to overflow given a valid cnt */
+  return ((uint64_t)sizeof (SORT_KEY_T))*(uint64_t)cnt /* Space for the n SORT_KEY_T's */
+       + ((uint64_t)alignof(SORT_KEY_T))-(uint64_t)1;  /* Worst case alignment padding */
+}
+
+SORT_STATIC SORT_KEY_T *
+SORT_IMPL(stable_node)( SORT_KEY_T * x,
+                        SORT_IDX_T   n,
+                        SORT_KEY_T * t ) {
+
+  /* Optimized handling of base cases */
+
+# include "sort_stable_base.c"
+
+  /* Note that n is at least 2 at this point */
+  /* Break input into approximately equal halves and sort them */
+
+  SORT_KEY_T * xl = x;
+  SORT_KEY_T * tl = t;
+  SORT_IDX_T   nl = n >> 1;
+  SORT_KEY_T * yl = SORT_IMPL(stable_node)( xl,nl, tl );
+
+  SORT_KEY_T * xr = x + nl;
+  SORT_KEY_T * tr = t + nl;
+  SORT_IDX_T   nr = n - nl;
+  SORT_KEY_T * yr = SORT_IMPL(stable_node)( xr,nr, tr );
+
+  /* If left subsort result ended up in orig array, merge into temp
+     array.  Otherwise, merge into orig array. */
+
+  if( yl==xl ) x = t;
+
+  /* Note that yl does not overlap with the location for merge output
+     at this point.  yr might overlap (with the right half)
+     with the location for merge output but this will still work in that
+     case. */
+
+  SORT_IDX_T i = (SORT_IDX_T)0;
+  SORT_IDX_T j = (SORT_IDX_T)0;
+  SORT_IDX_T k = (SORT_IDX_T)0;
+
+  /* Note that nl and nr are both at least one at this point so at least
+     one iteration of the loop body is necessary. */
+
+  for(;;) { /* Minimal C language operations */
+    if( SORT_BEFORE( yr[k], yl[j] ) ) {
+      x[i++] = yr[k++];
+      if( k>=nr ) { /* append left  stragglers (at least one) */ do x[i++] = yl[j++]; while( j<nl ); break; }
+    } else {
+      x[i++] = yl[j++];
+      if( j>=nl ) { /* append right stragglers (at least one) */ do x[i++] = yr[k++]; while( k<nr ); break; }
+    }
+  }
+
+  return x;
+}
+
+SORT_STATIC_INLINE SORT_KEY_T *
+SORT_IMPL(stable)( SORT_KEY_T * key,
+                   SORT_IDX_T   cnt,        /* Assumed valid cnt */
+                   void       * scratch ) {
+  return SORT_IMPL(stable_node)( key, cnt, ALIGN_UP( scratch, SORT_KEY_T ) );
+}
+
+#undef SORT_IMPL
+#undef SORT_XC3
+#undef SORT_C3
+
+#undef SORT_STATIC_INLINE
+#undef SORT_STATIC
+#undef SORT_BEFORE
+#undef SORT_IDX_T
+#undef SORT_KEY_T
+#undef SORT_NAME
@@ -0,0 +1,21 @@
+  do { /* BEGIN AUTOGENERATED CODE (n= 7) *****************************/
+    /* This network of comparators and fallthroughs implement a sorting network representation
+       of an insertion sort. Each case acts as a sort pass with the fallthrough falling through
+       to smaller ranges of the input. */
+#   define SORT_STABLE_CE(i,j) u = x[(SORT_IDX_T)i]; v = x[(SORT_IDX_T)j]; c = SORT_BEFORE( v, u ); x[(SORT_IDX_T)i] = c ? v : u; x[(SORT_IDX_T)j] = c ? u : v
+    int        c;
+    SORT_KEY_T u;
+    SORT_KEY_T v;
+    switch( n ) {
+    case (SORT_IDX_T) 7: SORT_STABLE_CE( 0, 1); SORT_STABLE_CE( 1, 2); SORT_STABLE_CE( 2, 3); SORT_STABLE_CE( 3, 4); SORT_STABLE_CE( 4, 5); SORT_STABLE_CE( 5, 6); /* fall through */
+    case (SORT_IDX_T) 6: SORT_STABLE_CE( 0, 1); SORT_STABLE_CE( 1, 2); SORT_STABLE_CE( 2, 3); SORT_STABLE_CE( 3, 4); SORT_STABLE_CE( 4, 5); /* fall through */
+    case (SORT_IDX_T) 5: SORT_STABLE_CE( 0, 1); SORT_STABLE_CE( 1, 2); SORT_STABLE_CE( 2, 3); SORT_STABLE_CE( 3, 4); /* fall through */
+    case (SORT_IDX_T) 4: SORT_STABLE_CE( 0, 1); SORT_STABLE_CE( 1, 2); SORT_STABLE_CE( 2, 3); /* fall through */
+    case (SORT_IDX_T) 3: SORT_STABLE_CE( 0, 1); SORT_STABLE_CE( 1, 2); /* fall through */
+    case (SORT_IDX_T) 2: SORT_STABLE_CE( 0, 1); /* fall through */
+    case (SORT_IDX_T) 1: /* fall through */
+    case (SORT_IDX_T) 0: return x;
+    default: break;
+    }
+#   undef SORT_STABLE_CE
+  } while(0); /* END AUTOGENERATED CODE *******************************/
@@ -188,7 +188,8 @@ static inline bool upd_aggregate( pc_price_t *ptr, uint64_t slot, int64_t timest
     // note: numv>0 and nprcs = 3*numv at this point
     int64_t agg_p25;
     int64_t agg_p75;
-    price_model_core( (uint64_t)nprcs, prcs, &agg_p25, &agg_price, &agg_p75 );
+    int64_t scratch[ PC_NUM_COMP * 3 ]; // ~0.75KiB for current PC_NUM_COMP (FIXME: DOUBLE CHECK THIS FITS INTO STACK FRAME LIMIT)
+    price_model_core( (uint64_t)nprcs, prcs, &agg_p25, &agg_price, &agg_p75, scratch );
 
     // get the left and right confidences
     // note that as valid quotes have positive prices currently and
 
@@ -3,7 +3,6 @@ mod test_add_price;
 mod test_add_product;
 mod test_add_publisher;
 mod test_aggregation;
-mod test_benchmark;
 mod test_c_code;
 mod test_check_valid_signable_account_or_permissioned_funding_account;
 mod test_del_price;
 
@@ -55,7 +55,6 @@ use {
     },
     solana_sdk::{
         account::Account,
-        commitment_config::CommitmentLevel,
         signature::{
             Keypair,
             Signer,
@@ -207,7 +206,7 @@ impl PythSimulator {
 
         self.context
             .banks_client
-            .process_transaction_with_commitment(transaction, CommitmentLevel::Processed)
+            .process_transaction(transaction)
             .await
     }
 
 
@@ -5,6 +5,7 @@ mod c {
     #[link(name = "cpyth-test")]
     extern "C" {
         pub fn test_price_model() -> i32;
+        pub fn test_sort_stable() -> i32;
         pub fn test_align() -> i32;
         pub fn test_avg() -> i32;
         pub fn test_hash() -> i32;
@@ -21,6 +22,13 @@ fn test_price_model() {
     }
 }
 
+#[test]
+fn test_sort_stable() {
+    unsafe {
+        assert_eq!(c::test_sort_stable(), 0);
+    }
+}
+
 #[test]
 fn test_align() {
     unsafe {