mixi大規模障害について解明編

こんにちは、システム本部技術部たんぽぽGの森本です。

先日のmixi大規模障害の原因となったmemcachedの不具合の詳細な解明ができました。
再来週まで発表を見合わせようと思ったのですが、早くお伝えしたほうがいいと思いましたので公開発表致します。

memcachedとlibevent

memcachedはlibeventというライブラリを使用してクライアントからの要求(接続、コマンド送信)を処理しています。
libeventを使用するにはevent_baseという構造体を用います。

main threadはmain_baseを使用します。

static struct event_base *main_base;
...
int main (int argc, char **argv) {
...
    main_base = event_init();
...
    /* enter the event loop */
    event_base_loop(main_base, 0);
...
}

worker threadはthread毎に自前のevent_baseを使用します。

typedef struct {
    pthread_t thread_id;        /* unique ID of this thread */
    struct event_base *base;    /* libevent handle this thread uses */
    struct event notify_event;  /* listen event for notify pipe */
    int notify_receive_fd;      /* receiving end of notify pipe */
    int notify_send_fd;         /* sending end of notify pipe */
    struct thread_stats stats;  /* Stats generated by this thread */
    struct conn_queue *new_conn_queue; /* queue of new connections to handle */
    cache_t *suffix_cache;      /* suffix cache */
} LIBEVENT_THREAD;

static void setup_thread(LIBEVENT_THREAD *me) {
    me->base = event_init();

通常の動作では、main threadとworker threadは自分専用のevent_baseにのみアクセスするので排他制御は必要ありません。
main threadはクライアントからのaccept要求を処理します。
worker threadは接続済みのクライアントからの要求(コマンド実行)を処理します。

コネクションが一杯になった時の処理

memcachedはコネクションがいっぱいになると accept_new_cons(false) を呼び出して、接続要求の受付を停止します。

static void drive_machine(conn *c) {
...
    while (!stop) {
        switch(c->state) {
        case conn_listening:
            addrlen = sizeof(addr);
            if ((sfd = accept(c->sfd, (struct sockaddr *)&addr, &addrlen)) == -1) {
                if (errno == EAGAIN || errno == EWOULDBLOCK) {
                    /* these are transient, so don't log anything */
                    stop = true;
                } else if (errno == EMFILE) {
                    if (settings.verbose > 0)
                        fprintf(stderr, "Too many open connections\n");
                    accept_new_conns(false);
                    stop = true;
                } else {
                    perror("accept()");
                    stop = true;
                }
                break;
            }
...
}

void accept_new_conns(const bool do_accept) {
    pthread_mutex_lock(&conn_lock);
    do_accept_new_conns(do_accept);
    pthread_mutex_unlock(&conn_lock);
}

void do_accept_new_conns(const bool do_accept) {
    conn *next;

    for (next = listen_conn; next; next = next->next) {
        if (do_accept) {
            update_event(next, EV_READ | EV_PERSIST);
            if (listen(next->sfd, settings.backlog) != 0) {
                perror("listen");
            }
        }
        else {
            update_event(next, 0);
            if (listen(next->sfd, 0) != 0) {
                perror("listen");
            }
        }
    }
...
}

static bool update_event(conn *c, const int new_flags) {
    assert(c != NULL);

    struct event_base *base = c->event.ev_base;
    if (c->ev_flags == new_flags)
        return true;
    if (event_del(&c->event) == -1) return false;
    event_set(&c->event, c->sfd, new_flags, event_handler, (void *)c);
    event_base_set(base, &c->event);
    c->ev_flags = new_flags;
    if (event_add(&c->event, 0) == -1) return false;
    return true;
}

accept_new_cons(false)から最終的に呼ばれるevent_set()の第3引数は0なので、libeventはsfd(accept用ソケット)の監視を停止します。
またdo_accept_new_conns(false)でlisten(next->sfd, 0)としてソケットのlisten backlogを0にします。

コネクションがクローズされた時の処理

worker threadはクライアントとのソケットが切断されると conn_close(c) を呼び出します。

static void drive_machine(conn *c) {
...
    while (!stop) {

        switch(c->state) {
...
        case conn_closing:
            if (IS_UDP(c->transport))
                conn_cleanup(c);
            else
                conn_close(c);
            stop = true;
            break;
...
}

static void conn_close(conn *c) {
    assert(c != NULL);
...
    accept_new_conns(true);
...
}

クライアントとのソケットが切断されたということは、接続に1本余裕ができたはずです。
conn_closeからaccept_new_cons(true)を呼びだし、必要であればacceptを再開します。

void accept_new_conns(const bool do_accept) {
    pthread_mutex_lock(&conn_lock);
    do_accept_new_conns(do_accept);
    pthread_mutex_unlock(&conn_lock);
}

void do_accept_new_conns(const bool do_accept) {
    conn *next;

    for (next = listen_conn; next; next = next->next) {
        if (do_accept) {
            update_event(next, EV_READ | EV_PERSIST);
            if (listen(next->sfd, settings.backlog) != 0) {
                perror("listen");
            }
        }
        else {
            update_event(next, 0);
            if (listen(next->sfd, 0) != 0) {
                perror("listen");
            }
        }
    }
...
}

static bool update_event(conn *c, const int new_flags) {
    assert(c != NULL);

    struct event_base *base = c->event.ev_base;
    if (c->ev_flags == new_flags)
        return true;
    if (event_del(&c->event) == -1) return false;
    event_set(&c->event, c->sfd, new_flags, event_handler, (void *)c);
    event_base_set(base, &c->event);
    c->ev_flags = new_flags;
    if (event_add(&c->event, 0) == -1) return false;
    return true;
}

コネクションが一杯になった時と同じ do_accept_new_conns(),update_event()が呼び出されますが、さきほどとはフラグの値が異なっています。
update_eventの内部では他の処理とは異なり、worker thread が main_eventに変更をくわえます。
排他制御が崩れるとしたらこの部分が怪しいのですが、accept_new_conns()でmutexを使用した排他制御を行っていますし、update_event内部でも

    if (c->ev_flags == new_flags)
        return true;
...
    c->ev_flags = new_flags;

としているので、すでにaccept再開後は以降の処理が走らないようになっています。

どこで排他制御が崩れたか

今まで見てきた流れでは

worker thread間はmutexで排他制御されている。
accept中は、c->ev_flagが立っているためworker threadは以降の処理を実行しません。
accept停止中はmain threadはaccept用ソケットの監視を停止しているので、main threadはmain_eventに変更を加えません。

ときちんと排他制御ができているように見えます。

問題はdo_accept_new_conns()にありました。

void do_accept_new_conns(const bool do_accept) {
    conn *next;

    for (next = listen_conn; next; next = next->next) {
        if (do_accept) {
            update_event(next, EV_READ | EV_PERSIST);
            if (listen(next->sfd, settings.backlog) != 0) {
                perror("listen");
            }
        }
        else {
            update_event(next, 0);
            if (listen(next->sfd, 0) != 0) {
                perror("listen");
            }
        }
    }
...
}

do_accept_new_connsはlisten_connリストの要素でループしています。
memcachedは、初期化時にgetaddrinfo()を用いてaccept待ちをすべきネットワークI/Fを取得して、listen_connを作成します。
通常でもeth0とloの2つが取得されるので、ほとんどの場合2つ以上のconnがlist_connリストに登録されている状態になります。
※-lオプションにて指定することもできますが、その場合は一つのネットワークI/Fしか使用できません。

connの定義は以下の様になっていて、event構造体を含んでいます。

typedef struct conn conn;
struct conn {
    int    sfd;
...
    struct event event;
    short  ev_flags;
...
};

update_eventの内部で c->event

    struct event_base *base = c->event.ev_base;

と event_baseを取り出していますが、do_accept_new_connsから呼ばれた場合、この値は main_baseになります。
複数のconnが同じmain_baseを指している状況です。
main_baseが一つしか無いのはしょうがないのですが、複数のconnが個別のev_flagsを持ち、ev_flagsにて処理の制御を行っているにも関わらず、対象は一つのmain_baseであることが問題を引き起こしています。

つまり、

クライアントがソケットを切断する。
worker thread(以下 W):切断を検知してconn_close(c)を実行。
W:do_accept_new_conns(true)を実行。
W:list_connの要素でループを開始。
W:一つ目のconnに対して
1. W:update_event(next, EV_READ | EV_PERSIST)を実行。
2. W:event_set(&c->event, c->sfd, new_flags, event_handler, (void *)c);を実行。
  この時点でmain threadがaccept用ソケットの監視を再開します。
  ここで排他制御が崩れました。
  これ以降、main threadはworker threadとは非同期にmain_baseに変更を加えます。
W:二つ目のconnに対して
1. W:update_event(next, EV_READ | EV_PERSIST)を実行。
2. W:if (c->ev_flags == new_flags) の判定はありますが、今回は二つ目のconnなので c->ev_flagsは0のままです。
  returnせずに次の処理に進みます。
3. W:event_add(&c->event, 0)を実行。
  event_addの内部で、処理待ちのevent数(event_count)をインクリメントしています。
  つまり、main threadが動いているにもかかわらずworker threadがmain_baseに操作を加えています。
  ここでタイミングよく(悪く?)main threadも同時にevent_addを行いevent_countのインクリメントを行うと、event_countの値が一つ少なくなってしまいます。
  処理待ちのeventがあるにも関わらず event_countが0となってしまい、main threadはevent_base_loop()を抜けてしまい、memcachedが終了します。

int
event_base_loop(struct event_base *base, int flags)
{
...
	while (!done) {
...
		/* If we have no events, we just exit */
		if (!event_haveevents(base)) {
			event_debug(("%s: no events registered.", __func__));
			return (1);
		}
...
	}

	event_debug(("%s: asked to terminate loop.", __func__));
	return (0);
}
int
event_haveevents(struct event_base *base)
{
	return (base->event_count > 0);
}

動作確認

上記の説明が正しいことを裏付けるために負荷テストを行いました。

memcachedの設定は

memcached -U 0 -u nobody -p 11222  -t 4 -m 16000 -C -c 1000 -l 192.168.0.1 -v

とネットワークI/Fを制限しました。
そこに3台のクライアントからmalaさんのテストスクリプトを用いて接続しました。

大量の”Too many open connections”を出しつづけながらも70時間(Fri 19:00-Mon 16:00と)以上落ちること無く動作しつづけました。

上記設定から-l オプションを外すと面白いぐらいにmemcachedが落ちまくります。

clock_handler

実はmemcachedにはtimerイベントも存在しており、accept中・accept停止などに関わらず常に動作しています。
低い確率ではありますが、このメソッド内でのevtimer_addによりevent_count値の不整合が起こる可能性もあります。

static void clock_handler(const int fd, const short which, void *arg) {
    struct timeval t = {.tv_sec = 1, .tv_usec = 0};
...
    evtimer_set(&clockevent, clock_handler, 0);
    event_base_set(main_base, &clockevent);
    evtimer_add(&clockevent, &t);

    set_current_time();
}