首页 > 代码库 > Parrot源码分析之海贼王

Parrot源码分析之海贼王

speedup-example.cpp

/* Copyright (c) 2013,  Regents of the Columbia University 
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other 
 * materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
 * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
//#include "tern/user.h"

#define N 4
//#define M 30000
#define M 30000*10

//int nwait = 0;
volatile long long sum;
volatile long long sum_2;
long loops = 6e3;
pthread_mutex_t mutex;
pthread_cond_t cond;
pthread_barrier_t bar;

void set_affinity(int core_id) {
  cpu_set_t cpuset;
  CPU_ZERO(&cpuset);
  CPU_SET(core_id, &cpuset);
  assert(pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) == 0);
}

void* thread_func(void *arg) {
  set_affinity((int)(long)arg);
  for (int j = 0; j < M; j++) {
    //pthread_mutex_lock(&mutex);
    //nwait++;
    for (long i = 0; i < loops; i++) // This is the key of speedup for parrot: the mutex needs to be a little bit congested.
    {  
	sum += i;
	sum_2 += i;
    }
    //pthread_cond_wait(&cond, &mutex);

    //printf("being in the lock is: %lu\n", pthread_self());

    //pthread_mutex_unlock(&mutex);
    //soba_wait(0);
    //pthread_barrier_wait(&bar);
    for (long i = 0; i < loops; i++)
    {	
	sum += i*i*i*i*i*i;
    }
    //fprintf(stderr, "compute thread %u %d\n", (unsigned)thread, sched_getcpu());
  }
}

int main(int argc, char *argv[]) {
  set_affinity(23);
  //soba_init(0, N, 20);
  pthread_t th[N];
  int ret;
  //pthread_cond_init(&cond, NULL);
  //pthread_barrier_init(&bar, NULL, N);

  for(unsigned i=0; i<N; ++i) {
    ret  = pthread_create(&th[i], NULL, thread_func, (void*)i);
    assert(!ret && "pthread_create() failed!");
  }

  /*for (int j = 0; j < M; j++) {
    while (nwait < N) {
      sched_yield();
    }
    pthread_mutex_lock(&mutex);
    nwait = 0;
    //fprintf(stderr, "broadcast %u %d\n", (unsigned)pthread_self(), sched_getcpu());
    pthread_cond_broadcast(&cond);
    pthread_mutex_unlock(&mutex);
  }
  */	


  for(unsigned i=0; i<N; ++i)
    pthread_join(th[i], NULL);

  // test the validity of results
  printf("sum_2 = %lld\n", sum_2);
  //printf("sum = %lld\n", sum);

  exit(0);
}


上面是我们整个旅程的路线,下面是我们的始发地。


xtern/lib/runtime/record-scheduler.cpp

void RRScheduler::getTurn()
{
  int tid = self();
  assert(tid>=0 && tid < Scheduler::nthread);
  waits[tid].wait();
  dprintf("RRScheduler: %d gets turn\n", self());
  SELFCHECK;
}

self()返回当前线程的tid,来自xtern/include/tern/runtime/scheduler.h中的struct TidMap:

  /// tern tid for current thread
  static int self() { return self_tid; }

RRScheduler::getTurn是RRScheduler::wait_t::wait的唯一入口。

getTurn的作用是让一个线程运行,而让其他线程等待,实现串行化。

实际上getTurn最早见于struct Serializer,所以这里引出一些重要类的继承关系。


接下来要弄明白是哪里调用了getTurn,以及它在RR调度中起的作用。

在34次直接调用getTurn()的共有五处:

  • xtern/lib/runtime/record-runtime.cpp: line 337: SCHED_TIMER_START(共调用30次)
  • xtern/lib/runtime/record-runtime.cpp: line 272: RecorderRT<_S>::idle_sleep (共调用1次)
  • xtern/lib/runtime/record-scheduler.cpp: line 372: RRScheduler::wait(共调用1次)
  • xtern/lib/runtime/record-runtime.cpp: line 376: RecorderRT<_S>::printStat (共调用1次)
  • xtern/lib/runtime/record-runtime.cpp: line 286: RecorderRT<_S>::idle_cond_wait(共调用1次)

而调用SCHED_TIMER_START的函数有(它们都在xtern/lib/runtime/record-runtime.cpp):

  • RecorderRT<_S>::threadBegin(void)
  • RecorderRT<_S>::threadEnd(unsigned ins)
  • RecorderRT<_S>::pthreadCreate
  • RecorderRT<_S>::pthreadJoin
  • RecorderRT<_S>::pthreadMutexInit
  • RecorderRT<_S>::pthreadMutexDestroy
  • RecorderRT<_S>::pthreadMutexLock
  • RecorderRT<_S>::__pthread_rwlock_rdlock
  • RecorderRT<_S>::__pthread_rwlock_wrlock
  • RecorderRT<_S>::__pthread_rwlock_tryrdlock
  • RecorderRT<_S>::__pthread_rwlock_trywrlock
  • RecorderRT<_S>::__pthread_rwlock_unlock
  • RecorderRT<_S>::__pthread_rwlock_destroy
  • RecorderRT<_S>::__pthread_rwlock_init
  • RecorderRT<_S>::pthreadMutexTryLock
  • RecorderRT<_S>::pthreadMutexTimedLock
  • RecorderRT<_S>::pthreadMutexUnlock
  • RecorderRT<_S>::pthreadBarrierInit
  • RecorderRT<_S>::pthreadBarrierWait
  • RecorderRT<_S>::pthreadBarrierDestroy
  • RecorderRT<_S>::pthreadCondWait
  • RecorderRT<_S>::pthreadCondTimedWait
  • RecorderRT<_S>::pthreadCondSignal
  • RecorderRT<_S>::pthreadCondBroadcast
  • RecorderRT<_S>::semWait
  • RecorderRT<_S>::semTryWait
  • RecorderRT<_S>::semTimedWait
  • RecorderRT<_S>::semPost
  • RecorderRT<_S>::semInit
  • RecorderRT<_S>::lineupInit
  • RecorderRT<_S>::lineupDestroy
  • RecorderRT<_S>::lineupStart
  • RecorderRT<_S>::lineupEnd
  • RecorderRT<_S>::nonDetStart()
  • RecorderRT<_S>::symbolic
  • RecorderRT<RecordSerializer>::pthreadBarrierWait
  • RecorderRT<RecordSerializer>::pthreadCondWait
  • RecorderRT<RecordSerializer>::pthreadCondTimedWait
  • RecorderRT<RecordSerializer>::pthreadCondSignal
  • RecorderRT<RecordSerializer>::pthreadCondBroadcast
  • RecorderRT<_S>::__fork
  • RecorderRT<_S>::__execv
  • RecorderRT<_S>::schedYield
  • RecorderRT<_S>::__sleep
  • RecorderRT<_S>::__usleep
  • RecorderRT<_S>::__nanosleep
我们需要确定以上函数到底哪些被调用。如果一个一个函数去测,显得笨重而且容易出错,所以我打算从运行源头看整个调用链。

第一回

首先从“东海颠倒山”__libc_start_main(xtern/dync_hook/spec_hooks.cpp: line 73)开始我们的旅行(还是寻宝比较给劲儿^-^)。通过“永久指针”printf我们来到了“伟大航路”上的 __tern_prog_begin(xtern/lib/runtime/helper.cpp: line  123)岛。让我们在它上面寻找我们想要的东西吧。果然有宝物tern::InstallRuntime(),顺着它我们找了对于我们这个大航海至关重要的伙伴Runtime::the = new RecorderRT<RRScheduler>(xtern/lib/runtime/record-runtime.cpp: line 183)。伙伴the是师承东海的Runtime和王下七武海的RRScheduler的一名剑客(RecorderRT)。
下一站在the的指引下我们来到了the的老师身为王下七武海的RRScheduler的领地,我们要向他请求一样东西。可是RRScheduler并没有,不过在他的帮助下我们通过他师父(Scheduler)的师父(Serializer)的师父(TidMap),锁定了宝物的位置。它藏在xtern/include/tern/runtime/scheduler.h中TidMap(pthread_t main_th) { init(main_th); }中的init里。果然我们终于在init里面找到了宝物create()。

宝物create告诉我们,之前我们找到的宝物tern::InstallRuntime()需要和一件宝物相接,而这件宝物应该在我们发现tern::InstallRuntime()的地方的附近。没办法,只好返回 __tern_prog_begin(xtern/lib/runtime/helper.cpp: line  123)岛。在printf("liuqiushan8019\n")和printf("liuqiushan8020\n")的夹逼下,我们将地点锁定到了tern_thread_begin(); // main thread begins。这时the使出了大招百八十烦恼风Runtime::the->threadBegin(),击败敌人后,终于夺得了宝物SCHED_TIMER_START。


第二回

上回说到我们得到一个宝物叫create(),它由一个兄弟,我们顺着这条线:__libc_start_main->__tern_prog_begin->tern_pthread_create,发现在tern_pthread_create中有一个岛屿叫Runtime::the->pthreadCreate,在它上面果然又有一个create(),但目前我们不知道它的作用,只知道它印有idle。这让我想探究一下create()的家室,看看它究竟还有多少个兄弟,每个create()又肩负着怎样的使命。
现在我们要弄明白的是创建的四个子线程的执行顺序是什么样的,是一个线程先完成任务,再让另一个线程执行,还是A-B-C-D这样不断循环,最终近乎一块结束?在有LD_PRELOAD时(即使用Parrot),创建四个子线程,M=30000,loops=6e3,奇妙的现象出现了:当有pthread_mutex_lock(&mutex)和pthread_mutex_unlock(&mutex)时,四个子线程的执行顺序是A-B-C-D循环M次;当没有mutex时,四个子线程的执行顺序是A执行完自己所有的任务,然后B开始执行,再C,再D,而且没有mutex不影响结果的准确。
让我们请出大神VTune,帮助我们分析一下这两种顺序的差异和原因。这是我们大航海面临的第一个强敌,不可大意,在此我下达船长命令:全体船员准备战斗!

有mutex:



无mutex:




接下来让我们从源码的角度看一看这四个子线程是如何被创建出来的。在上面寻找应有idle的create时,我们发现了tern_pthread_create,我猜想它就是创建idle和其他四个子线程的入口。奇怪的是那里调用了tern_pthread_create?PARROT中只有两处调用了tern_pthread_create,但经鉴定都不是,其中一处位于xtern\lib\runtime\record-runtime.cpp: line 2183,它在整个过程中未执行;另一处位于xtern\lib\runtime\helper.cpp: line 152,它是用来创建一开始的idle线程的,我们将它注掉,并不影响speedup-example的运行。所以我们大胆猜想:根据LD_PRELOAD的原理,在speedup-example.cpp中我们使用了pthread_create,虽然在Parrot中在xtern\eval\rand-intercept\rand-intercept.c: line 251中有同名函数pthread_create,但它并未被使用,所以我们猜想是tern_pthread_create替换了speedup-example.cpp中的pthread_create。
这样我们大致弄清楚了程序一开始到创建四个子线程这段过程中究竟Parrot在干什么——

(A)程序开始执行到创建四个子线程之前




简单地说就是初始化了Runtime::the,然后main thread begins,再然后创建了一个idle线程,目前还不知道其作用。

(B)利用tern_pthread_create创建四个子线程。

在第三回中我们来探索一下tern_pthread_create都干了些什么。


第三回


上回说到我们遇到了第一个强敌“两种顺序”,在首回交锋中,他使出了大招“用tern_pthread_create来创建子线程”。接下来看我们如何破。



这就是大招的原理分解图。在分析RecorderRT<_S>::pthreadCreate时,发现有提示:

/// The pthread_create wrapper solves three problems.
///
/// Problem 1.  We must assign a logical tern tid to the new thread while
/// holding turn, or multiple newly created thread could get their logical
/// tids nondeterministically.  To do so, we assign a logical tid to a new
/// thread in the thread that creates the new thread.
///
/// If we were to assign this logical id in the new thread itself, we may
/// run into nondeterministic runs, as illustrated by the following
/// example
///
///       t0        t1           t2            t3
///    getTurn();
///    create t2
///    putTurn();
///               getTurn();
///               create t3
///               putTurn();
///                                         getTurn();
///                                         get turn tid 2
///                                         putTurn();
///                            getTurn();
///                            get turn tid 3
///                            putTurn();
///
/// in a different run, t1 may run first and get turn tid 2.
///
/// Problem 2.  When a child thread is created, the child thread may run
/// into a getTurn() before the parent thread has assigned a logical tid
/// to the child thread.  This causes getTurn to refer to self_tid, which
/// is undefined.  To solve this problem, we use @thread_begin_sem to
/// create a thread in suspended mode, until the parent thread has
/// assigned a logical tid for the child thread.
///
/// Problem 3.  We can use one semaphore to create a child thread
/// suspended.  However, when there are two pthread_create() calls, we may
/// still run into cases where a child thread tries to get its
/// thread-local tid but gets -1.  consider
///
///       t0        t1           t2            t3
///    getTurn();
///    create t2
///    sem_post(&thread_begin_sem)
///    putTurn();
///               getTurn();
///               create t3
///                                         sem_wait(&thread_begin_sem);
///                                         self_tid = TidMap[pthread_self()]
///               sem_post(&thread_begin_sem)
///               putTurn();
///                                         getTurn();
///                                         get turn tid 2
///                                         putTurn();
///                            sem_wait(&thread_begin_sem);
///                            self_tid = TidMap[pthread_self()]
///                            getTurn();
///                            get turn tid 3
///                            putTurn();
///
/// The crux of the problem is that multiple sem_post can pair up with
/// multiple sem_down in different ways.  We solve this problem using
/// another semaphore, thread_begin_done_sem.
///

顿时剑法的一些关键招数四处浮现在眼前:pthread_create wrapper,assign a logical tid to a new thread,holding turn,getTurn...

看来是上乘剑法。


有了招数分解图和一部分剑谱还是费解,那么先来看一下我们的伙伴厨师top探得的消息:


情景一:./speedup-example(有mutex,四个子线程,M=30000*10,没使用Parrot)



情景二:./run_example_tern(有mutex,四个子线程,M=30000*10,使用Parrot)



情景三:./run_example_tern(没有mutex,四个子线程,M=30000*10,使用Parrot)



通过top,我们发现——

  • 情景二和情景三相比于情景一,id列(绿色部分)为0.0%,而情景一的id列则占据了较大的百分比。
  • 情景三较之于情景二,sy列(橙色部分)为0.0%,而情景二的sy列占据了近乎一半。
  • 情景一和情景二的四个core(Cpu0,1,2,3)都是一起运行,而情景三是一个接着一个运行(Cpu0->Cpu1->Cpu2->Cpu3)。

这让我们想探究一下在情景二和情景三下四个子线程的创建顺序是什么样的。先设想一下:情景二是四个子线程近乎同时创建,而情景二是一开始先近乎同时创建两个子线程,此后每当一个子线程完成自己的任务结束时再创建一个新的子线程。


情景四:./run_example_tern(有mutex,八个子线程,M=30000*10,使用Parrot)



情景五:./run_example_tern(没有mutex,八个子线程,M=30000*10,使用Parrot)








Parrot源码分析之海贼王