Skip to content

Commit cdebb0a

Browse files
authored
feat(state): separate read-only sqlite instance for better concurrency (#281)
cf. mattn/go-sqlite3#1179 (comment) --------- Signed-off-by: Gyuho Lee <[email protected]>
1 parent a8f1118 commit cdebb0a

File tree

108 files changed

+700
-447
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

108 files changed

+700
-447
lines changed

cmd/gpud/command/login.go

+13-6
Original file line numberDiff line numberDiff line change
@@ -31,26 +31,33 @@ func cmdLogin(cliContext *cli.Context) error {
3131
if err != nil {
3232
return fmt.Errorf("failed to get state file: %w", err)
3333
}
34-
db, err := sqlite.Open(stateFile)
34+
35+
dbRW, err := sqlite.Open(stateFile)
36+
if err != nil {
37+
return fmt.Errorf("failed to open state file: %w", err)
38+
}
39+
defer dbRW.Close()
40+
41+
dbRO, err := sqlite.Open(stateFile, sqlite.WithReadOnly(true))
3542
if err != nil {
3643
return fmt.Errorf("failed to open state file: %w", err)
3744
}
38-
defer db.Close()
45+
defer dbRO.Close()
3946

40-
uid, err := state.CreateMachineIDIfNotExist(rootCtx, db, "")
47+
uid, err := state.CreateMachineIDIfNotExist(rootCtx, dbRW, dbRO, "")
4148
if err != nil {
4249
return fmt.Errorf("failed to get machine uid: %w", err)
4350
}
4451

45-
components, err := state.GetComponents(rootCtx, db, uid)
52+
components, err := state.GetComponents(rootCtx, dbRO, uid)
4653
if err != nil {
4754
return fmt.Errorf("failed to get components: %w", err)
4855
}
4956

5057
cliToken := cliContext.String("token")
5158
endpoint := cliContext.String("endpoint")
5259

53-
dbToken, _ := state.GetLoginInfo(rootCtx, db, uid)
60+
dbToken, _ := state.GetLoginInfo(rootCtx, dbRO, uid)
5461
token := dbToken
5562
if cliToken != "" {
5663
token = cliToken
@@ -88,7 +95,7 @@ func cmdLogin(cliContext *cli.Context) error {
8895
}
8996

9097
if token != dbToken {
91-
if err = state.UpdateLoginInfo(rootCtx, db, uid, token); err != nil {
98+
if err = state.UpdateLoginInfo(rootCtx, dbRW, uid, token); err != nil {
9299
fmt.Println("machine logged in but failed to update token:", err)
93100
}
94101
}

cmd/gpud/command/utils.go

+8-7
Original file line numberDiff line numberDiff line change
@@ -9,19 +9,20 @@ import (
99
"github.com/leptonai/gpud/pkg/sqlite"
1010
)
1111

12+
// GetUID returns the machine ID from the state file.
13+
// Returns an empty string and sql.ErrNoRows if the machine ID is not found.
14+
// Opens the state file read-only; assumes the machine ID has already been created.
1215
func GetUID(ctx context.Context) (string, error) {
1316
stateFile, err := config.DefaultStateFile()
1417
if err != nil {
1518
return "", fmt.Errorf("failed to get state file: %w", err)
1619
}
17-
db, err := sqlite.Open(stateFile)
20+
21+
dbRO, err := sqlite.Open(stateFile, sqlite.WithReadOnly(true))
1822
if err != nil {
1923
return "", fmt.Errorf("failed to open state file: %w", err)
2024
}
21-
defer db.Close()
22-
uid, err := state.CreateMachineIDIfNotExist(ctx, db, "")
23-
if err != nil {
24-
return "", fmt.Errorf("failed to get machine uid: %w", err)
25-
}
26-
return uid, nil
25+
defer dbRO.Close()
26+
27+
return state.GetMachineID(ctx, dbRO)
2728
}

components/accelerator/nvidia/bad-envs/component.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ func New(ctx context.Context, cfg Config) components.Component {
1717
cfg.Query.SetDefaultsIfNotSet()
1818

1919
cctx, ccancel := context.WithCancel(ctx)
20-
nvidia_query.SetDefaultPoller(cfg.Query.State.DB)
20+
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
2121
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, bad_envs_id.Name)
2222

2323
return &component{

components/accelerator/nvidia/bad-envs/config.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ type Config struct {
1111
Query query_config.Config `json:"query"`
1212
}
1313

14-
func ParseConfig(b any, db *sql.DB) (*Config, error) {
14+
func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
1515
raw, err := json.Marshal(b)
1616
if err != nil {
1717
return nil, err
@@ -22,7 +22,8 @@ func ParseConfig(b any, db *sql.DB) (*Config, error) {
2222
return nil, err
2323
}
2424
if cfg.Query.State != nil {
25-
cfg.Query.State.DB = db
25+
cfg.Query.State.DBRW = dbRW
26+
cfg.Query.State.DBRO = dbRO
2627
}
2728
return cfg, nil
2829
}

components/accelerator/nvidia/clock-speed/component.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ func New(ctx context.Context, cfg Config) components.Component {
2121
cfg.Query.SetDefaultsIfNotSet()
2222

2323
cctx, ccancel := context.WithCancel(ctx)
24-
nvidia_query.SetDefaultPoller(cfg.Query.State.DB)
24+
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
2525
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_clock_speed_id.Name)
2626

2727
return &component{
@@ -137,7 +137,7 @@ func (c *component) Close() error {
137137

138138
var _ components.PromRegisterer = (*component)(nil)
139139

140-
func (c *component) RegisterCollectors(reg *prometheus.Registry, db *sql.DB, tableName string) error {
140+
func (c *component) RegisterCollectors(reg *prometheus.Registry, dbRW *sql.DB, dbRO *sql.DB, tableName string) error {
141141
c.gatherer = reg
142-
return nvidia_query_metrics_clockspeed.Register(reg, db, tableName)
142+
return nvidia_query_metrics_clockspeed.Register(reg, dbRW, dbRO, tableName)
143143
}

components/accelerator/nvidia/clock-speed/config.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ type Config struct {
1111
Query query_config.Config `json:"query"`
1212
}
1313

14-
func ParseConfig(b any, db *sql.DB) (*Config, error) {
14+
func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
1515
raw, err := json.Marshal(b)
1616
if err != nil {
1717
return nil, err
@@ -22,7 +22,8 @@ func ParseConfig(b any, db *sql.DB) (*Config, error) {
2222
return nil, err
2323
}
2424
if cfg.Query.State != nil {
25-
cfg.Query.State.DB = db
25+
cfg.Query.State.DBRW = dbRW
26+
cfg.Query.State.DBRO = dbRO
2627
}
2728
return cfg, nil
2829
}

components/accelerator/nvidia/ecc/component.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ func New(ctx context.Context, cfg Config) components.Component {
2121
cfg.Query.SetDefaultsIfNotSet()
2222

2323
cctx, ccancel := context.WithCancel(ctx)
24-
nvidia_query.SetDefaultPoller(cfg.Query.State.DB)
24+
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
2525
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_ecc_id.Name)
2626

2727
return &component{
@@ -161,7 +161,7 @@ func (c *component) Close() error {
161161

162162
var _ components.PromRegisterer = (*component)(nil)
163163

164-
func (c *component) RegisterCollectors(reg *prometheus.Registry, db *sql.DB, tableName string) error {
164+
func (c *component) RegisterCollectors(reg *prometheus.Registry, dbRW *sql.DB, dbRO *sql.DB, tableName string) error {
165165
c.gatherer = reg
166-
return nvidia_query_metrics_ecc.Register(reg, db, tableName)
166+
return nvidia_query_metrics_ecc.Register(reg, dbRW, dbRO, tableName)
167167
}

components/accelerator/nvidia/ecc/config.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ type Config struct {
1111
Query query_config.Config `json:"query"`
1212
}
1313

14-
func ParseConfig(b any, db *sql.DB) (*Config, error) {
14+
func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
1515
raw, err := json.Marshal(b)
1616
if err != nil {
1717
return nil, err
@@ -22,7 +22,8 @@ func ParseConfig(b any, db *sql.DB) (*Config, error) {
2222
return nil, err
2323
}
2424
if cfg.Query.State != nil {
25-
cfg.Query.State.DB = db
25+
cfg.Query.State.DBRW = dbRW
26+
cfg.Query.State.DBRO = dbRO
2627
}
2728
return cfg, nil
2829
}

components/accelerator/nvidia/error-xid-sxid/component.go

+4-5
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package errorxidsxid
33

44
import (
55
"context"
6-
"database/sql"
76
"fmt"
87
"strconv"
98
"time"
@@ -24,24 +23,24 @@ func New(ctx context.Context, cfg Config) components.Component {
2423

2524
// this starts the Xid poller via "nvml.StartDefaultInstance"
2625
cctx, ccancel := context.WithCancel(ctx)
27-
nvidia_query.SetDefaultPoller(cfg.Query.State.DB)
26+
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
2827
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_error_xid_sxid_id.Name)
2928

3029
return &component{
30+
cfg: cfg,
3131
rootCtx: ctx,
3232
cancel: ccancel,
3333
poller: nvidia_query.GetDefaultPoller(),
34-
db: cfg.Query.State.DB,
3534
}
3635
}
3736

3837
var _ components.Component = (*component)(nil)
3938

4039
type component struct {
40+
cfg Config
4141
rootCtx context.Context
4242
cancel context.CancelFunc
4343
poller query.Poller
44-
db *sql.DB
4544
}
4645

4746
func (c *component) Name() string { return nvidia_error_xid_sxid_id.Name }
@@ -61,7 +60,7 @@ const (
6160
)
6261

6362
func (c *component) Events(ctx context.Context, since time.Time) ([]components.Event, error) {
64-
events, err := nvidia_xid_sxid_state.ReadEvents(ctx, c.db, nvidia_xid_sxid_state.WithSince(since))
63+
events, err := nvidia_xid_sxid_state.ReadEvents(ctx, c.cfg.Query.State.DBRO, nvidia_xid_sxid_state.WithSince(since))
6564
if err != nil {
6665
return nil, err
6766
}

components/accelerator/nvidia/error-xid-sxid/config.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ type Config struct {
1111
Query query_config.Config `json:"query"`
1212
}
1313

14-
func ParseConfig(b any, db *sql.DB) (*Config, error) {
14+
func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
1515
raw, err := json.Marshal(b)
1616
if err != nil {
1717
return nil, err
@@ -22,7 +22,8 @@ func ParseConfig(b any, db *sql.DB) (*Config, error) {
2222
return nil, err
2323
}
2424
if cfg.Query.State != nil {
25-
cfg.Query.State.DB = db
25+
cfg.Query.State.DBRW = dbRW
26+
cfg.Query.State.DBRO = dbRO
2627
}
2728
return cfg, nil
2829
}

components/accelerator/nvidia/error/component.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ func New(ctx context.Context, cfg Config) components.Component {
1818
cfg.Query.SetDefaultsIfNotSet()
1919

2020
cctx, ccancel := context.WithCancel(ctx)
21-
nvidia_query.SetDefaultPoller(cfg.Query.State.DB)
21+
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
2222
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)
2323

2424
return &component{

components/accelerator/nvidia/error/config.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ type Config struct {
1111
Query query_config.Config `json:"query"`
1212
}
1313

14-
func ParseConfig(b any, db *sql.DB) (*Config, error) {
14+
func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
1515
raw, err := json.Marshal(b)
1616
if err != nil {
1717
return nil, err
@@ -22,7 +22,8 @@ func ParseConfig(b any, db *sql.DB) (*Config, error) {
2222
return nil, err
2323
}
2424
if cfg.Query.State != nil {
25-
cfg.Query.State.DB = db
25+
cfg.Query.State.DBRW = dbRW
26+
cfg.Query.State.DBRO = dbRO
2627
}
2728
return cfg, nil
2829
}

components/accelerator/nvidia/error/xid/config.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ type Config struct {
1111
Query query_config.Config `json:"query"`
1212
}
1313

14-
func ParseConfig(b any, db *sql.DB) (*Config, error) {
14+
func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
1515
raw, err := json.Marshal(b)
1616
if err != nil {
1717
return nil, err
@@ -22,7 +22,8 @@ func ParseConfig(b any, db *sql.DB) (*Config, error) {
2222
return nil, err
2323
}
2424
if cfg.Query.State != nil {
25-
cfg.Query.State.DB = db
25+
cfg.Query.State.DBRW = dbRW
26+
cfg.Query.State.DBRO = dbRO
2627
}
2728
return cfg, nil
2829
}

components/accelerator/nvidia/fabric-manager/component.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ func New(ctx context.Context, cfg Config) (components.Component, error) {
2121
cfg.Query.SetDefaultsIfNotSet()
2222

2323
cctx, ccancel := context.WithCancel(ctx)
24-
nvidia_query.SetDefaultPoller(cfg.Log.Query.State.DB)
24+
nvidia_query.SetDefaultPoller(cfg.Log.Query.State.DBRW, cfg.Log.Query.State.DBRO)
2525
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)
2626

2727
if err := cfg.Log.Validate(); err != nil {

components/accelerator/nvidia/fabric-manager/component_test.go

+10-3
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,17 @@ func TestComponentLog(t *testing.T) {
2929
ctx, cancel := context.WithCancel(context.Background())
3030
defer cancel()
3131

32-
db, err := sqlite.Open(":memory:")
32+
dbRW, err := sqlite.Open(":memory:")
3333
if err != nil {
3434
t.Fatalf("failed to open database: %v", err)
3535
}
36-
defer db.Close()
36+
defer dbRW.Close()
37+
38+
dbRO, err := sqlite.Open(":memory:", sqlite.WithReadOnly(true))
39+
if err != nil {
40+
t.Fatalf("failed to open database: %v", err)
41+
}
42+
defer dbRO.Close()
3743

3844
pollInterval := 3 * time.Second
3945
component, err := New(
@@ -43,7 +49,8 @@ func TestComponentLog(t *testing.T) {
4349
Query: query_config.Config{
4450
Interval: metav1.Duration{Duration: pollInterval},
4551
State: &query_config.State{
46-
DB: db,
52+
DBRW: dbRW,
53+
DBRO: dbRO,
4754
},
4855
},
4956
BufferSize: query_log_config.DefaultBufferSize,

components/accelerator/nvidia/fabric-manager/config.go

+7-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ type Config struct {
1717
Log query_log_config.Config `json:"log"`
1818
}
1919

20-
func ParseConfig(b any, db *sql.DB) (*Config, error) {
20+
func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
2121
raw, err := json.Marshal(b)
2222
if err != nil {
2323
return nil, err
@@ -28,7 +28,12 @@ func ParseConfig(b any, db *sql.DB) (*Config, error) {
2828
return nil, err
2929
}
3030
if cfg.Query.State != nil {
31-
cfg.Query.State.DB = db
31+
cfg.Query.State.DBRW = dbRW
32+
cfg.Query.State.DBRO = dbRO
33+
}
34+
if cfg.Log.Query.State != nil {
35+
cfg.Log.Query.State.DBRW = dbRW
36+
cfg.Log.Query.State.DBRO = dbRO
3237
}
3338
return cfg, nil
3439
}

components/accelerator/nvidia/gpm/component.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ func (c *component) Close() error {
168168

169169
var _ components.PromRegisterer = (*component)(nil)
170170

171-
func (c *component) RegisterCollectors(reg *prometheus.Registry, db *sql.DB, tableName string) error {
171+
func (c *component) RegisterCollectors(reg *prometheus.Registry, dbRW *sql.DB, dbRO *sql.DB, tableName string) error {
172172
c.gatherer = reg
173-
return nvidia_query_metrics_gpm.Register(reg, db, tableName)
173+
return nvidia_query_metrics_gpm.Register(reg, dbRW, dbRO, tableName)
174174
}

components/accelerator/nvidia/gpm/config.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ type Config struct {
1111
Query query_config.Config `json:"query"`
1212
}
1313

14-
func ParseConfig(b any, db *sql.DB) (*Config, error) {
14+
func ParseConfig(b any, dbRW *sql.DB, dbRO *sql.DB) (*Config, error) {
1515
raw, err := json.Marshal(b)
1616
if err != nil {
1717
return nil, err
@@ -22,7 +22,8 @@ func ParseConfig(b any, db *sql.DB) (*Config, error) {
2222
return nil, err
2323
}
2424
if cfg.Query.State != nil {
25-
cfg.Query.State.DB = db
25+
cfg.Query.State.DBRW = dbRW
26+
cfg.Query.State.DBRO = dbRO
2627
}
2728
return cfg, nil
2829
}

components/accelerator/nvidia/gsp-firmware-mode/component.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ func New(ctx context.Context, cfg Config) components.Component {
1717
cfg.Query.SetDefaultsIfNotSet()
1818

1919
cctx, ccancel := context.WithCancel(ctx)
20-
nvidia_query.SetDefaultPoller(cfg.Query.State.DB)
20+
nvidia_query.SetDefaultPoller(cfg.Query.State.DBRW, cfg.Query.State.DBRO)
2121
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_gsp_firmware_mode_id.Name)
2222

2323
return &component{

0 commit comments

Comments
 (0)