@@ -314,6 +314,346 @@ if err != nil {
314
314
> 注意:
315
315
> - 请根据接口文档填写具体的访问参数,接口链接为[队列详情](https://cloud.baidu.com/doc/AIHC/s/Hm569qc26)
316
316
317
+ ## 训练
318
+ ### 查询训练任务列表
319
+ 使用以下代码可以查询训练任务列表。
320
+ ``` go
321
+ // import "github.com/baidubce/bce-sdk-go/services/aihc"
322
+ // import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
323
+ ak , sk , endpoint := ak_test, sk_test, endpoint_test
324
+ client , _ := aihc.NewClient (ak, sk, endpoint)
325
+ req := &v1.OpenAPIJobListRequest {
326
+ ResourcePoolID: RESOURCE_POOL_ID ,
327
+ PageNo: 1 ,
328
+ PageSize: 3 ,
329
+ }
330
+ result , err := client.ListJobs (req)
331
+
332
+ if err != nil {
333
+ panic (err)
334
+ }
335
+ jsonBytes , _ := json.Marshal (result)
336
+ fmt.Println (string (jsonBytes))
337
+ ```
338
+
339
+ > 注意:
340
+ > - 请根据接口文档填写具体的访问参数,接口链接为[查询训练任务列表](https://cloud.baidu.com/doc/AIHC/s/rm56ipjsz)
341
+
342
+ ### 创建训练任务
343
+ 使用以下代码可以创建训练任务。
344
+ ``` go
345
+ // import "github.com/baidubce/bce-sdk-go/services/aihc"
346
+ // import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
347
+ ak , sk , endpoint := ak_test, sk_test, endpoint_test
348
+ resourcePoolID := RESOURCE_POOL_ID
349
+
350
+ jobConfig := &v1.OpenAPIJobCreateRequest {
351
+ Name: AIJobName ,
352
+ JobSpec: v1.OpenAPIAIJobSpec {
353
+ Command: ` echo "hello sdk"; sleep infinity` ,
354
+ Replicas: 1 ,
355
+ Image: ImageID,
356
+ Resources: []v1.OpenAPIResource {
357
+ {
358
+ Name: " cpu" ,
359
+ Quantity: 1 ,
360
+ },
361
+ },
362
+ EnableRDMA: false ,
363
+ },
364
+ EnableBccl: false ,
365
+ }
366
+ client , _ := aihc.NewClient (ak, sk, endpoint)
367
+ result , err := client.CreateJob (jobConfig, resourcePoolID)
368
+
369
+ if err != nil {
370
+ panic (err)
371
+ }
372
+
373
+ jsonBytes , _ := json.Marshal (result)
374
+ fmt.Println (string (jsonBytes))
375
+ ```
376
+
377
+ > 注意:
378
+ > - 请根据接口文档填写具体的访问参数,接口链接为[创建训练任务](https://cloud.baidu.com/doc/AIHC/s/jm56inxn7)
379
+
380
+
381
+ ### 查询训练任务详情
382
+ 使用以下代码可以查询训练任务详情。
383
+ ``` go
384
+ // import "github.com/baidubce/bce-sdk-go/services/aihc"
385
+ // import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
386
+ ak , sk , endpoint := ak_test, sk_test, endpoint_test
387
+ resourcePoolID , JobID := RESOURCE_POOL_ID , AIJobID
388
+
389
+ client , _ := aihc.NewClient (ak, sk, endpoint)
390
+ result , err := client.GetJob (JobID, resourcePoolID)
391
+
392
+ if err != nil {
393
+ panic (err)
394
+ }
395
+
396
+ jsonBytes , _ := json.Marshal (result)
397
+ fmt.Println (string (jsonBytes))
398
+ ```
399
+
400
+ > 注意:
401
+ > - 请根据接口文档填写具体的访问参数,接口链接为[查询训练任务详情](https://cloud.baidu.com/doc/AIHC/s/Cm56ir6ui)
402
+
403
+
404
+ ### 更新训练任务
405
+ 使用以下代码可以更新训练任务。
406
+ ``` go
407
+ // import "github.com/baidubce/bce-sdk-go/services/aihc"
408
+ // import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
409
+ ak , sk , endpoint := ak_test, sk_test, endpoint_test
410
+ resourcePoolID := RESOURCE_POOL_ID
411
+ jobID := AIJobID
412
+
413
+ jobConfig := &v1.OpenAPIJobUpdateRequest {
414
+ Priority: " high" ,
415
+ }
416
+ client , _ := aihc.NewClient (ak, sk, endpoint)
417
+ result , err := client.UpdateJob (jobConfig, jobID, resourcePoolID)
418
+
419
+ if err != nil {
420
+ panic (err)
421
+ }
422
+ jsonBytes , _ := json.Marshal (result)
423
+ fmt.Println (string (jsonBytes))
424
+ ```
425
+
426
+ > 注意:
427
+ > - 请根据接口文档填写具体的访问参数,接口链接为[更新训练任务](https://cloud.baidu.com/doc/AIHC/s/um56issf8)
428
+
429
+ ### 停止训练任务
430
+ 使用以下代码可以停止训练任务。
431
+ ``` go
432
+ // import "github.com/baidubce/bce-sdk-go/services/aihc"
433
+ // import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
434
+ ak , sk , endpoint := ak_test, sk_test, endpoint_test
435
+ resourcePoolID := RESOURCE_POOL_ID
436
+ jobID := AIJobID
437
+
438
+ client , _ := aihc.NewClient (ak, sk, endpoint)
439
+ result , err := client.StopJob (jobID, resourcePoolID)
440
+ log.Infof (" stop job result: %v " , result)
441
+ if err != nil {
442
+ panic (err)
443
+ }
444
+ jsonBytes , _ := json.Marshal (result)
445
+ fmt.Println (string (jsonBytes))
446
+ ```
447
+
448
+ > 注意:
449
+ > - 请根据接口文档填写具体的访问参数,接口链接为[停止训练任务](https://cloud.baidu.com/doc/AIHC/s/hm56izyfa)
450
+
451
+
452
+
453
+ ### 删除训练任务
454
+ 使用以下代码可以删除训练任务。
455
+ ``` go
456
+ // import "github.com/baidubce/bce-sdk-go/services/aihc"
457
+ // import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
458
+ ak , sk , endpoint := ak_test, sk_test, endpoint_test
459
+ resourcePoolID , JobID := RESOURCE_POOL_ID , AIJobID
460
+
461
+ client , _ := aihc.NewClient (ak, sk, endpoint)
462
+ result , err := client.DeleteJob (JobID, resourcePoolID)
463
+
464
+ if err != nil {
465
+ panic (err)
466
+ }
467
+
468
+ jsonBytes , _ := json.Marshal (result)
469
+ fmt.Println (string (jsonBytes))
470
+ ```
471
+
472
+ > 注意:
473
+ > - 请根据接口文档填写具体的访问参数,接口链接为[删除训练任务](https://cloud.baidu.com/doc/AIHC/s/Sm56iuodq)
474
+
475
+
476
+
477
+ ### 查询训练任务事件
478
+ 使用以下代码可以查询训练任务事件。
479
+ ``` go
480
+ // import "github.com/baidubce/bce-sdk-go/services/aihc"
481
+ // import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
482
+ ak , sk , endpoint := ak_test, sk_test, endpoint_test
483
+
484
+ req := &v1.GetJobEventsRequest {
485
+ Namespace: " " ,
486
+ JobFramework: " PyTorchJob" ,
487
+ StartTime: " " ,
488
+ EndTime: " " ,
489
+ JobID: AIJobID ,
490
+ ResourcePoolID: RESOURCE_POOL_ID ,
491
+ }
492
+
493
+ client , _ := aihc.NewClient (ak, sk, endpoint)
494
+ result , err := client.GetTaskEvent (req)
495
+
496
+ if err != nil {
497
+ panic (err)
498
+ }
499
+ jsonBytes , _ := json.Marshal (result)
500
+ fmt.Println (string (jsonBytes))
501
+ ```
502
+
503
+ > 注意:
504
+ > - 请根据接口文档填写具体的访问参数,接口链接为[查询训练任务事件](https://cloud.baidu.com/doc/AIHC/s/Km56iw5oj)
505
+
506
+
507
+ ### 查询训练任务日志
508
+ 使用以下代码可以查询训练任务日志。
509
+ ``` go
510
+ // import "github.com/baidubce/bce-sdk-go/services/aihc"
511
+ // import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
512
+ ak , sk , endpoint := ak_test, sk_test, endpoint_test
513
+
514
+ req := &v1.GetPodLogsRequest {
515
+ JobID: AIJobID ,
516
+ ResourcePoolID: RESOURCE_POOL_ID ,
517
+ PodName: PodName ,
518
+ Namespace: " default" ,
519
+ StartTime: " " ,
520
+ EndTime: " " ,
521
+ MaxLines: " " ,
522
+ Container: " " ,
523
+ Chunk: " " ,
524
+ }
525
+
526
+ client , _ := aihc.NewClient (ak, sk, endpoint)
527
+ result , err := client.GetPodLogs (req)
528
+
529
+ if err != nil {
530
+ panic (err)
531
+ }
532
+ jsonBytes , _ := json.Marshal (result)
533
+ fmt.Println (string (jsonBytes))
534
+ ```
535
+
536
+ > 注意:
537
+ > - 请根据接口文档填写具体的访问参数,接口链接为[查询训练任务日志](https://cloud.baidu.com/doc/AIHC/s/wm56ixjus)
538
+
539
+
540
+ ### 查询训练任务Pod事件
541
+ 使用以下代码可以查询训练任务Pod事件。
542
+ ``` go
543
+ // import "github.com/baidubce/bce-sdk-go/services/aihc"
544
+ // import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
545
+ ak , sk , endpoint := ak_test, sk_test, endpoint_test
546
+ req := &v1.GetPodEventsRequest {
547
+ JobID: AIJobID ,
548
+ ResourcePoolID: RESOURCE_POOL_ID ,
549
+ Namespace: " " ,
550
+ JobFramework: " PyTorchJob" ,
551
+ StartTime: " " ,
552
+ EndTime: " " ,
553
+ PodName: PodName ,
554
+ }
555
+
556
+ client , _ := aihc.NewClient (ak, sk, endpoint)
557
+ result , err := client.GetPodEvents (req)
558
+
559
+ if err != nil {
560
+ panic (err)
561
+ }
562
+ jsonBytes , _ := json.Marshal (result)
563
+ fmt.Println (string (jsonBytes))
564
+ ```
565
+
566
+ > 注意:
567
+ > - 请根据接口文档填写具体的访问参数,接口链接为[查询训练任务Pod事件](https://cloud.baidu.com/doc/AIHC/s/vm56iypch)
568
+
569
+
570
+
571
+ ### 查询训练任务监控
572
+ 使用以下代码可以查询训练任务监控。
573
+ ``` go
574
+ // import "github.com/baidubce/bce-sdk-go/services/aihc"
575
+ // import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
576
+ ak , sk , endpoint := ak_test, sk_test, endpoint_test
577
+ req := &v1.GetTaskMetricsRequest {
578
+ StartTime: " " ,
579
+ ResourcePoolID: RESOURCE_POOL_ID ,
580
+ EndTime: " " ,
581
+ TimeStep: " " ,
582
+ MetricType: MetricType ,
583
+ JobID: AIJobID ,
584
+ Namespace: " " ,
585
+ RateInterval: " " ,
586
+ }
587
+
588
+ client , _ := aihc.NewClient (ak, sk, endpoint)
589
+ result , err := client.GetTaskMetrics (req)
590
+
591
+ if err != nil {
592
+ panic (err)
593
+ }
594
+ jsonBytes , _ := json.Marshal (result)
595
+ fmt.Println (string (jsonBytes))
596
+ ```
597
+
598
+ > 注意:
599
+ > - 请根据接口文档填写具体的访问参数,接口链接为[查询训练任务监控](https://cloud.baidu.com/doc/AIHC/s/Um56j1bo8)
600
+
601
+
602
+ ### 查询训练任务所在节点列表
603
+ 使用以下代码可以查询训练任务所在节点列表。
604
+ ``` go
605
+ // import "github.com/baidubce/bce-sdk-go/services/aihc"
606
+ // import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
607
+ ak , sk , endpoint := ak_test, sk_test, endpoint_test
608
+ resourcePoolID := RESOURCE_POOL_ID
609
+ jobID := AIJobID
610
+ namespace := " "
611
+
612
+ client , _ := aihc.NewClient (ak, sk, endpoint)
613
+ result , err := client.GetJobNodesList (jobID, resourcePoolID, namespace)
614
+
615
+ if err != nil {
616
+ panic (err)
617
+ }
618
+ jsonBytes , _ := json.Marshal (result)
619
+ fmt.Println (string (jsonBytes))
620
+ ```
621
+
622
+ > 注意:
623
+ > - 请根据接口文档填写具体的访问参数,接口链接为[查询训练任务所在节点列表](https://cloud.baidu.com/doc/AIHC/s/Mm56j2j2c)
624
+
625
+
626
+
627
+ ### 获取训练任务WebTerminal地址
628
+ 使用以下代码可以获取训练任务WebTerminal地址。
629
+ ``` go
630
+ // import "github.com/baidubce/bce-sdk-go/services/aihc"
631
+ // import "github.com/baidubce/bce-sdk-go/services/aihc/api/v1"
632
+ ak , sk , endpoint := ak_test, sk_test, endpoint_test
633
+
634
+ req := &v1.GetWebShellURLRequest {
635
+ JobID: AIJobID ,
636
+ ResourcePoolID: RESOURCE_POOL_ID ,
637
+ PodName: PodName ,
638
+ Namespace: " " ,
639
+ PingTimeoutSecond: " " ,
640
+ HandshakeTimeoutSecond: " " ,
641
+ }
642
+
643
+ client , _ := aihc.NewClient (ak, sk, endpoint)
644
+ result , err := client.GetWebSSHUrl (req)
645
+
646
+ if err != nil {
647
+ panic (err)
648
+ }
649
+ jsonBytes , _ := json.Marshal (result)
650
+ fmt.Println (string (jsonBytes))
651
+ ```
652
+
653
+ > 注意:
654
+ > - 请根据接口文档填写具体的访问参数,接口链接为[获取训练任务WebTerminal地址](https://cloud.baidu.com/doc/AIHC/s/Im56j3op4)
655
+
656
+
317
657
## 自定义部署
318
658
### 创建服务
319
659
使用以下代码可以创建服务。
0 commit comments