An issue with the executor_error check being falsely positive by TheBits · Pull Request #1160 · dstackai/dstack · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

An issue with the executor_error check being falsely positive #1160

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions runner/cmd/shim/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,11 +224,13 @@ func writeHostInfo() {
func getGpuInfo() [][]string {
cmd := execute.ExecTask{
Command: "docker",
Args: []string{"run",
Args: []string{
"run",
"--rm",
"--gpus", "all",
"dstackai/base:py3.11-0.4rc4-cuda-12.1",
"nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv"},
"nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv",
},
StreamStdio: false,
}

Expand Down
28 changes: 7 additions & 21 deletions runner/consts/consts.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
package consts

import "time"

const DstackDirPath string = ".dstack"

// Runner's log filenames
const RunnerDefaultLogFileName = "default.log"
const RunnerJobLogFileName = "job.log"
const RunnerLogFileName = "runner.log"
const (
RunnerDefaultLogFileName = "default.log"
RunnerJobLogFileName = "job.log"
RunnerLogFileName = "runner.log"
)

// Error-containing messages will be identified by this signature
const ExecutorFailedSignature = "Executor failed"
Expand All @@ -17,21 +17,7 @@ const HostInfoFile = "host_info.json"
// GPU constants
const NVIDIA_RUNTIME = "nvidia"

// JOB ports
const (
EXPOSE_PORT_START = 3000
EXPOSE_PORT_END = 4000
)

const MAX_ATTEMPTS = 10
const DELAY_TRY = 6 * time.Second

const DELAY_READ_STATUS = 5 * time.Second

const REPO_HTTPS_URL = "https://%s/%s/%s.git"
const REPO_GIT_URL = "git@%s:%s/%s.git"

const (
TERMINATE_POLICY = "terminate"
STOP_POLICY = "stop"
REPO_HTTPS_URL = "https://%s/%s/%s.git"
REPO_GIT_URL = "git@%s:%s/%s.git"
)
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -397,8 +397,9 @@ def _process_pulling_with_shim(

runner_client = client.RunnerClient(port=ports[client.REMOTE_RUNNER_PORT])
resp = runner_client.healthcheck()
if resp is None or container_status.state == "pending":
if container_status.executor_error:
error_states = ("pending", "running")
if resp is None or container_status.state in error_states:
if container_status.executor_error and container_status.state in error_states:
logger.error(
"The docker container of the job '%s' stops with executor error: %s",
job_model.job_name,
Expand Down
Loading
0