我如何使用多线程C#实现筛埃拉托色尼? [英] How do I implement the Sieve Of Eratosthenes using multithreaded C#?

查看:119
本文介绍了我如何使用多线程C#实现筛埃拉托色尼?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我想筛埃拉托色尼使用Mutithreading实现。下面是我实现的:

I am trying to implement Sieve Of Eratosthenes using Mutithreading. Here is my implementation:

using System;
using System.Collections.Generic;
using System.Threading;

namespace Sieve_Of_Eratosthenes 
{
    class Controller 
        {
        public static int upperLimit = 1000000000;
        public static bool[] primeArray = new bool[upperLimit];

        static void Main(string[] args) 
        {
        DateTime startTime = DateTime.Now;

        Initialize initial1 = new Initialize(0, 249999999);
        Initialize initial2 = new Initialize(250000000, 499999999);
        Initialize initial3 = new Initialize(500000000, 749999999);
        Initialize initial4 = new Initialize(750000000, 999999999);

        initial1.thread.Join();
        initial2.thread.Join();
        initial3.thread.Join();
        initial4.thread.Join();

        int sqrtLimit = (int)Math.Sqrt(upperLimit);

        Sieve sieve1 = new Sieve(249999999);
        Sieve sieve2 = new Sieve(499999999);
        Sieve sieve3 = new Sieve(749999999);
        Sieve sieve4 = new Sieve(999999999);

        for (int i = 3; i < sqrtLimit; i += 2) 
            {
            if (primeArray[i] == true) 
                {
                int squareI = i * i;

                    if (squareI <= 249999999) 
                    {
                sieve1.set(i);
                sieve2.set(i);
                sieve3.set(i);
                sieve4.set(i);
                sieve1.thread.Join();
                sieve2.thread.Join();
                sieve3.thread.Join();
                sieve4.thread.Join();
            } 
                    else if (squareI > 249999999 & squareI <= 499999999) 
                    {
                sieve2.set(i);
                sieve3.set(i);
                sieve4.set(i);
                sieve2.thread.Join();
                sieve3.thread.Join();
                sieve4.thread.Join();
            } 
                    else if (squareI > 499999999 & squareI <= 749999999) 
                    {
                sieve3.set(i);
                sieve4.set(i);
                sieve3.thread.Join();
                sieve4.thread.Join();
            } 
                    else if (squareI > 749999999 & squareI <= 999999999) 
                    {
                sieve4.set(i);
                sieve4.thread.Join();
            }
            }
        }    

        int count = 0;
        primeArray[2] = true;
        for (int i = 2; i < upperLimit; i++) 
            {
            if (primeArray[i]) 
                {
                count++;
            }
        }

        Console.WriteLine("Total: " + count);

        DateTime endTime = DateTime.Now;
        TimeSpan elapsedTime = endTime - startTime;
        Console.WriteLine("Elapsed time: " + elapsedTime.Seconds);
        }

        public class Initialize 
        {
            public Thread thread;
        private int lowerLimit;
        private int upperLimit;

        public Initialize(int lowerLimit, int upperLimit) 
            {
            this.lowerLimit = lowerLimit;
            this.upperLimit = upperLimit;
            thread = new Thread(this.InitializeArray);
            thread.Priority = ThreadPriority.Highest;
            thread.Start();
        }

        private void InitializeArray() 
            {
            for (int i = this.lowerLimit; i <= this.upperLimit; i++) 
                {
                if (i % 2 == 0) 
                    {
                    Controller.primeArray[i] = false;
            } 
                    else 
                    {
                Controller.primeArray[i] = true;
            }
            }
        }
        }

        public class Sieve 
            {
            public Thread thread;
            public int i;
            private int upperLimit;

            public Sieve(int upperLimit) 
                {
                this.upperLimit = upperLimit;
            }

        public void set(int i) 
            {
            this.i = i;
            thread = new Thread(this.primeGen);
            thread.Start();
        }

        public void primeGen() 
            {
            for (int j = this.i * this.i; j <= this.upperLimit; j += i) 
                {
                Controller.primeArray[j] = false;
            }
        }
        }
    }
}

这需要30秒产生输出,是有什么办法可以加快这?

This takes 30 seconds to produce the output, is there any way to speed this up?

编辑:
下面是TPL实现:

Here is the TPL implementation:

public LinkedList<int> GetPrimeList(int limit) {
        LinkedList<int> primeList = new LinkedList<int>();
        bool[] primeArray = new bool[limit];

        Console.WriteLine("Initialization started...");

        Parallel.For(0, limit, i => {
            if (i % 2 == 0) {
                primeArray[i] = false;
            } else {
                primeArray[i] = true;
            }
        }
        );
        Console.WriteLine("Initialization finished...");

        /*for (int i = 0; i < limit; i++) {
            if (i % 2 == 0) {
                primeArray[i] = false;
            } else {
                primeArray[i] = true;
            }
        }*/

        int sqrtLimit = (int)Math.Sqrt(limit);
        Console.WriteLine("Operation started...");
        Parallel.For(3, sqrtLimit, i => {
            lock (this) {
                if (primeArray[i]) {
                    for (int j = i * i; j < limit; j += i) {
                        primeArray[j] = false;
                    }

                }
            }
        }
        );
        Console.WriteLine("Operation finished...");
        /*for (int i = 3; i < sqrtLimit; i += 2) {
            if (primeArray[i]) {
                for (int j = i * i; j < limit; j += i) {
                    primeArray[j] = false;
                }
            }
        }*/

        //primeList.AddLast(2);
        int count = 1;
        Console.WriteLine("Counting started...");
        Parallel.For(3, limit, i => {
            lock (this) {
                if (primeArray[i]) {
                    //primeList.AddLast(i);
                    count++;
                }
            }
        }
        );
        Console.WriteLine("Counting finished...");
        Console.WriteLine(count);

        /*for (int i = 3; i < limit; i++) {
            if (primeArray[i]) {
                primeList.AddLast(i);
            }
        }*/

        return primeList;
    }

感谢您。

推荐答案

编辑:

我的问题的答案是:是的,你绝对可以使用任务并行库(TPL)找到素数一十亿更快。因为它不能有效地使用存储器或多处理,并且最终输出也是没有效率定的code(S)中的问题是,慢

My answer to the question is: Yes, you can definitely use the Task Parallel Library (TPL) to find the primes to one billion faster. The given code(s) in the question is slow because it isn't efficiently using memory or multiprocessing, and final output also isn't efficient.

所以其他不仅仅是多处理后,有大量的事情可以做,以加快Eratosthenese的筛号,如下所示:

So other than just multiprocessing, there are a huge number of things you can do to speed up the Sieve of Eratosthenese, as follows:


  1. 您筛所有数字,奇数和偶数,它们都使用更多的内存
    (对于你的One十亿范围十亿字节),并是由于慢
    到不必要的处理。只是使用的事实,二是
    只有偶素所以使阵列重新present只有奇素数会
    一半的存储器要求,降低的复合数
    通过在两个因素使操作数剔除操作
    可能采取类似20秒你的机器上的素数到
    十亿。

  2. 的是合数超过扑杀如此巨大的原因之一
    存储器阵列是如此之慢是它大大超过了CPU高速缓存
    大小,从而使许多存储器访问是到主存储器中稍微
    随机的方式即扑杀给定的合数
    再presentation可坐百余CPU时钟周期,而如果
    他们都在L1高速缓存将只需要一个周期,并在
    二级缓存只有约四个周期;不是所有的访问拿最差
    例次,但这绝对减慢了处理。使用位
    包装阵列重新present总理候选人将减少使用
    由八个系数存储器和使最坏的情况下访问少
    共同。而会有一个计算开销来访问
    各个位,你会发现有净收益为时间
    保存在降低平均存储器存取时间会大于
    这笔费用。实现这个的简单的方法是使用一个BitArray
    而不是布尔数组。编写自己的位访问使用
    转变与和操作会比利用更高效
    BitArray类。你会发现使用BitArray和轻微节约
    两另一个因素做自己的位运算单
    也许大约十或12秒线程性能与
    这一变化。

  3. 您发现素数的计数的输出是不是很有效
    如果需要每位考生首要条件数组访问和。
    一旦你的筛缓冲区数组紧缩字阵
    位,你可以用一个计数更有效地做到这一点仰望
    表(LUT),它消除了如果条件和仅需要两个
    数组访问的每比特紧缩字。这样,时间
    相比于时间计数变为工作的一个可忽略的部分
    扑杀合数,为进一步节省踏踏实实地
    如八个秒的素数的计数一十亿。

  4. 在处理的主要候选者的数量进一步减少能
    是施加车轮分解的结果,这消除说
    素数2,3,和5从处理和由多种因素
    调节位打包的方法还可以增加有效
    范围由另一约两倍的一个给定的尺寸比特缓冲器的。
    这可以通过减少复合数扑杀操作的数目
    高达三倍以上的另一个重要因素,尽管成本
    进一步的计算复杂性。

  5. 在为了进一步降低内存的使用,使内存访问甚至
    更高效,$ P $每页多处理的 pparing方式
    ,可以分工成没有更大的页面
    比L1或L2高速缓存大小。这需要一个保持基地
    素数的所有素数的表达最大的平方根
    总理候选人,并重新计算的起始地址参数
    每跨越一个给定的页面段扑杀使用的基本素数,
    但是这仍然比使用巨大剔除数组更有效率。一个
    额外的好处这个网页分割是一个再
    不必预先指定的上筛分限制,但
    而可以只延长在必要时进一步上基本素数
    页处理。与所有的优化,这一点,
    你可以很可能产生质数的计数达到一十亿
    约2.5秒。

  6. 最后,我们可以把最后的润色在多重页面
    使用TPL或线程,其中使用的一个缓冲区大小段
    (每核心)的L2高速缓存的大小会产生的加成增益
    两个在你的双核超非螺纹(HT)旧的因素
    处理器作为英特尔的Core 2 Duo E7500为执行时间找到
    素数的一个十亿的约1.25秒左右的数量。

  1. You sieve all numbers, even and odd, which both uses more memory (one billion bytes for your range of one billion) and is slower due to the unnecessary processing. Just using the fact that two is the only even prime so making the array represent only odd primes would half the memory requirements and reduce the number of composite number cull operations by over a factor of two so that the operation might take something like 20 seconds on your machine for primes to a billion.
  2. Part of the reason that composite number culling over such a huge memory array is so slow is that it greatly exceeds the CPU cache sizes so that many memory accesses are to main memory in a somewhat random fashion meaning that culling a given composite number representation can take over a hundred CPU clock cycles, whereas if they were all in the L1 cache it would only take one cycle and in the L2 cache only about four cycles; not all accesses take the worst case times, but this definitely slows the processing. Using a bit packed array to represent the prime candidates will reduce the use of memory by a factor of eight and make the worst case accesses less common. While there will be a computational overhead to accessing individual bits, you will find there is a net gain as the time saving in reducing average memory access time will be greater than this cost. The simple way to implement this is to use a BitArray rather than an array of bool. Writing your own bit accesses using shift and "and" operations will be more efficient than use of the BitArray class. You will find a slight saving using BitArray and another factor of two doing your own bit operations for a single threaded performance of perhaps about ten or twelve seconds with this change.
  3. Your output of the count of primes found is not very efficient as it requires an array access and an if condition per candidate prime. Once you have the sieve buffer as an array packed word array of bits, you can do this much more efficiently with a counting Look Up Table (LUT) which eliminates the if condition and only needs two array accesses per bit packed word. Doing this, the time to count becomes a negligible part of the work as compared to the time to cull composite numbers, for a further saving to get down to perhaps eight seconds for the count of the primes to one billion.
  4. Further reductions in the number of processed prime candidates can be the result of applying wheel factorization, which removes say the factors of the primes 2, 3, and 5 from the processing and by adjusting the method of bit packing can also increase the effective range of a given size bit buffer by a factor of another about two. This can reduce the number of composite number culling operations by another huge factor of up to over three times, although at the cost of further computational complexity.
  5. In order to further reduce memory use, making memory accesses even more efficient, and preparing the way for multiprocessing per page segment, one can divide the work into pages that are no larger than the L1 or L2 cache sizes. This requires that one keep a base primes table of all the primes up to the square root of the maximum prime candidate and recomputes the starting address parameters of each of the base primes used in culling across a given page segment, but this is still more efficient than using huge culling arrays. An added benefit to implementing this page segmenting is that one then does not have to specify the upper sieving limit in advance but rather can just extend the base primes as necessary as further upper pages are processed. With all of the optimizations to this point, you can likely produce the count of primes up to one billion in about 2.5 seconds.
  6. Finally, one can put the final touches on multiprocessing the page segments using TPL or Threads, which using a buffer size of about the L2 cache size (per core) will produce an addition gain of a factor of two on your dual core non Hyper Threaded (HT) older processor as the Intel E7500 Core2Duo for an execute time to find the number of primes to one billion of about 1.25 seconds or so.

我已实施埃拉托塞尼的多线程筛作为<一href=\"http://stackoverflow.com/questions/1569393/c-how-to-make-sieve-of-atkin-incremental/18139477#18139477\">answer到另一个线程显示没有任何优势,阿特金超过埃拉托色尼的筛筛。它使用任务并行库(TPL)作为任务和TaskFactory所以至少需要DOTNET的框架4.我进一步调整使用上述的所有的备用答案相同quesion 。我重新发布了调整code这里添加了注释,更容易阅读的格式如下:

I have implemented a multi-threaded Sieve of Eratosthenes as an answer to another thread to show there isn't any advantage to the Sieve of Atkin over the Sieve of Eratosthenes. It uses the Task Parallel Library (TPL) as in Tasks and TaskFactory so requires at least DotNet Framework 4. I have further tweaked that code using all of the optimizations discussed above as an alternate answer to the same quesion. I re-post that tweaked code here with added comments and easier-to-read formatting, as follows:

  using System;
  using System.Collections;
  using System.Collections.Generic;
  using System.Linq;
  using System.Threading;
  using System.Threading.Tasks;

  class UltimatePrimesSoE : IEnumerable<ulong> {
    #region const and static readonly field's, private struct's and classes

    //one can get single threaded performance by setting NUMPRCSPCS = 1
    static readonly uint NUMPRCSPCS = (uint)Environment.ProcessorCount + 1;
    //the L1CACHEPOW can not be less than 14 and is usually the two raised to the power of the L1 or L2 cache
    const int L1CACHEPOW = 14, L1CACHESZ = (1 << L1CACHEPOW), MXPGSZ = L1CACHESZ / 2; //for buffer ushort[]
    const uint CHNKSZ = 17; //this times BWHLWRDS (below) times two should not be bigger than the L2 cache in bytes
    //the 2,3,57 factorial wheel increment pattern, (sum) 48 elements long, starting at prime 19 position
    static readonly byte[] WHLPTRN = { 2,3,1,3,2,1,2,3,3,1,3,2,1,3,2,3,4,2,1,2,1,2,4,3,
                                       2,3,1,2,3,1,3,3,2,1,2,3,1,3,2,1,2,1,5,1,5,1,2,1 }; const uint FSTCP = 11;
    static readonly byte[] WHLPOS; static readonly byte[] WHLNDX; //look up wheel position from index and vice versa
    static readonly byte[] WHLRNDUP; //to look up wheel rounded up index positon values, allow for overflow in size
    static readonly uint WCRC = WHLPTRN.Aggregate(0u, (acc, n) => acc + n); //small wheel circumference for odd numbers
    static readonly uint WHTS = (uint)WHLPTRN.Length; static readonly uint WPC = WHTS >> 4; //number of wheel candidates
    static readonly byte[] BWHLPRMS = { 2,3,5,7,11,13,17 }; const uint FSTBP = 19; //big wheel primes, following prime
    //the big wheel circumference expressed in number of 16 bit words as in a minimum bit buffer size
    static readonly uint BWHLWRDS = BWHLPRMS.Aggregate(1u, (acc, p) => acc * p) / 2 / WCRC * WHTS / 16;
    //page size and range as developed from the above
    static readonly uint PGSZ = MXPGSZ / BWHLWRDS * BWHLWRDS; static readonly uint PGRNG = PGSZ * 16 / WHTS * WCRC;
    //buffer size (multiple chunks) as produced from the above
    static readonly uint BFSZ = CHNKSZ * PGSZ, BFRNG = CHNKSZ * PGRNG; //number of uints even number of caches in chunk
    static readonly ushort[] MCPY; //a Master Copy page used to hold the lower base primes preculled version of the page
    struct Wst { public ushort msk; public byte mlt; public byte xtr; public ushort nxt; }
    static readonly byte[] PRLUT; /*Wheel Index Look Up Table */ static readonly Wst[] WSLUT; //Wheel State Look Up Table
    static readonly byte[] CLUT; // a Counting Look Up Table for very fast counting of primes

    class Bpa { //very efficient auto-resizing thread-safe read-only indexer class to hold the base primes array
      byte[] sa = new byte[0]; uint lwi = 0, lpd = 0; object lck = new object();
      public uint this[uint i] {
        get {
          if (i >= this.sa.Length) lock (this.lck) {
              var lngth = this.sa.Length; while (i >= lngth) {
                var bf = (ushort[])MCPY.Clone(); if (lngth == 0) {
                  for (uint bi = 0, wi = 0, w = 0, msk = 0x8000, v = 0; w < bf.Length;
                      bi += WHLPTRN[wi++], wi = (wi >= WHTS) ? 0 : wi) {
                    if (msk >= 0x8000) { msk = 1; v = bf[w++]; } else msk <<= 1;
                    if ((v & msk) == 0) {
                      var p = FSTBP + (bi + bi); var k = (p * p - FSTBP) >> 1;
                      if (k >= PGRNG) break; var pd = p / WCRC; var kd = k / WCRC; var kn = WHLNDX[k - kd * WCRC];
                      for (uint wrd = kd * WPC + (uint)(kn >> 4), ndx = wi * WHTS + kn; wrd < bf.Length; ) {
                        var st = WSLUT[ndx]; bf[wrd] |= st.msk; wrd += st.mlt * pd + st.xtr; ndx = st.nxt;
                      }
                    }
                  }
                }
                else { this.lwi += PGRNG; cullbf(this.lwi, bf); }
                var c = count(PGRNG, bf); var na = new byte[lngth + c]; sa.CopyTo(na, 0);
                for (uint p = FSTBP + (this.lwi << 1), wi = 0, w = 0, msk = 0x8000, v = 0;
                    lngth < na.Length; p += (uint)(WHLPTRN[wi++] << 1), wi = (wi >= WHTS) ? 0 : wi) {
                  if (msk >= 0x8000) { msk = 1; v = bf[w++]; } else msk <<= 1; if ((v & msk) == 0) {
                    var pd = p / WCRC; na[lngth++] = (byte)(((pd - this.lpd) << 6) + wi); this.lpd = pd;
                  }
                } this.sa = na;
              }
            } return this.sa[i];
        }
      }
    }
    static readonly Bpa baseprms = new Bpa(); //the base primes array using the above class

    struct PrcsSpc { public Task tsk; public ushort[] buf; } //used for multi-threading buffer array processing

    #endregion

    #region private static methods

    static int count(uint bitlim, ushort[] buf) { //very fast counting method using the CLUT look up table
      if (bitlim < BFRNG) {
        var addr = (bitlim - 1) / WCRC; var bit = WHLNDX[bitlim - addr * WCRC] - 1; addr *= WPC;
        for (var i = 0; i < 3; ++i) buf[addr++] |= (ushort)((unchecked((ulong)-2) << bit) >> (i << 4));
      }
      var acc = 0; for (uint i = 0, w = 0; i < bitlim; i += WCRC)
        acc += CLUT[buf[w++]] + CLUT[buf[w++]] + CLUT[buf[w++]]; return acc;
    }

    static void cullbf(ulong lwi, ushort[] b) { //fast buffer segment culling method using a Wheel State Look Up Table
      ulong nlwi = lwi;
      for (var i = 0u; i < b.Length; nlwi += PGRNG, i += PGSZ) MCPY.CopyTo(b, i); //copy preculled lower base primes.
      for (uint i = 0, pd = 0; ; ++i) {
        pd += (uint)baseprms[i] >> 6;
        var wi = baseprms[i] & 0x3Fu; var wp = (uint)WHLPOS[wi]; var p = pd * WCRC + PRLUT[wi];
        var k = ((ulong)p * (ulong)p - FSTBP) >> 1;
        if (k >= nlwi) break; if (k < lwi) {
          k = (lwi - k) % (WCRC * p);
          if (k != 0) {
            var nwp = wp + (uint)((k + p - 1) / p); k = (WHLRNDUP[nwp] - wp) * p - k;
          }
        }
        else k -= lwi; var kd = k / WCRC; var kn = WHLNDX[k - kd * WCRC];
        for (uint wrd = (uint)kd * WPC + (uint)(kn >> 4), ndx = wi * WHTS + kn; wrd < b.Length; ) {
          var st = WSLUT[ndx]; b[wrd] |= st.msk; wrd += st.mlt * pd + st.xtr; ndx = st.nxt;
        }
      }
    }

    static Task cullbftsk(ulong lwi, ushort[] b, Action<ushort[]> f) { // forms a task of the cull buffer operaion
      return Task.Factory.StartNew(() => { cullbf(lwi, b); f(b); });
    }

    //iterates the action over each page up to the page including the top_number,
    //making an adjustment to the top limit for the last page.
    //this method works for non-dependent actions that can be executed in any order.
    static void IterateTo(ulong top_number, Action<ulong, uint, ushort[]> actn) {
      PrcsSpc[] ps = new PrcsSpc[NUMPRCSPCS]; for (var s = 0u; s < NUMPRCSPCS; ++s) ps[s] = new PrcsSpc {
        buf = new ushort[BFSZ],
        tsk = Task.Factory.StartNew(() => { })
      };
      var topndx = (top_number - FSTBP) >> 1; for (ulong ndx = 0; ndx <= topndx; ) {
        ps[0].tsk.Wait(); var buf = ps[0].buf; for (var s = 0u; s < NUMPRCSPCS - 1; ++s) ps[s] = ps[s + 1];
        var lowi = ndx; var nxtndx = ndx + BFRNG; var lim = topndx < nxtndx ? (uint)(topndx - ndx + 1) : BFRNG;
        ps[NUMPRCSPCS - 1] = new PrcsSpc { buf = buf, tsk = cullbftsk(ndx, buf, (b) => actn(lowi, lim, b)) };
        ndx = nxtndx;
      } for (var s = 0u; s < NUMPRCSPCS; ++s) ps[s].tsk.Wait();
    }

    //iterates the predicate over each page up to the page where the predicate paramenter returns true,
    //this method works for dependent operations that need to be executed in increasing order.
    //it is somewhat slower than the above as the predicate function is executed outside the task.
    static void IterateUntil(Func<ulong, ushort[], bool> prdct) {
      PrcsSpc[] ps = new PrcsSpc[NUMPRCSPCS];
      for (var s = 0u; s < NUMPRCSPCS; ++s) {
        var buf = new ushort[BFSZ];
        ps[s] = new PrcsSpc { buf = buf, tsk = cullbftsk(s * BFRNG, buf, (bfr) => { }) };
      }
      for (var ndx = 0UL; ; ndx += BFRNG) {
        ps[0].tsk.Wait(); var buf = ps[0].buf; var lowi = ndx; if (prdct(lowi, buf)) break;
        for (var s = 0u; s < NUMPRCSPCS - 1; ++s) ps[s] = ps[s + 1];
        ps[NUMPRCSPCS - 1] = new PrcsSpc {
          buf = buf,
          tsk = cullbftsk(ndx + NUMPRCSPCS * BFRNG, buf, (bfr) => { })
        };
      }
    }

    #endregion

    #region initialization

    /// <summary>
    /// the static constructor is used to initialize the static readonly arrays.
    /// </summary>
    static UltimatePrimesSoE() {
      WHLPOS = new byte[WHLPTRN.Length + 1]; //to look up wheel position index from wheel index
      for (byte i = 0, acc = 0; i < WHLPTRN.Length; ++i) { acc += WHLPTRN[i]; WHLPOS[i + 1] = acc; }
      WHLNDX = new byte[WCRC + 1]; for (byte i = 1; i < WHLPOS.Length; ++i) {
        for (byte j = (byte)(WHLPOS[i - 1] + 1); j <= WHLPOS[i]; ++j) WHLNDX[j] = i;
      }
      WHLRNDUP = new byte[WCRC * 2]; for (byte i = 1; i < WHLRNDUP.Length; ++i) {
        if (i > WCRC) WHLRNDUP[i] = (byte)(WCRC + WHLPOS[WHLNDX[i - WCRC]]); else WHLRNDUP[i] = WHLPOS[WHLNDX[i]];
      }
      Func<ushort, int> nmbts = (v) => { var acc = 0; while (v != 0) { acc += (int)v & 1; v >>= 1; } return acc; };
      CLUT = new byte[1 << 16]; for (var i = 0; i < CLUT.Length; ++i) CLUT[i] = (byte)nmbts((ushort)(i ^ -1));
      PRLUT = new byte[WHTS]; for (var i = 0; i < PRLUT.Length; ++i) {
        var t = (uint)(WHLPOS[i] * 2) + FSTBP; if (t >= WCRC) t -= WCRC; if (t >= WCRC) t -= WCRC; PRLUT[i] = (byte)t;
      }
      WSLUT = new Wst[WHTS * WHTS]; for (var x = 0u; x < WHTS; ++x) {
        var p = FSTBP + 2u * WHLPOS[x]; var pr = p % WCRC;
        for (uint y = 0, pos = (p * p - FSTBP) / 2; y < WHTS; ++y) {
          var m = WHLPTRN[(x + y) % WHTS];
          pos %= WCRC; var posn = WHLNDX[pos]; pos += m * pr; var nposd = pos / WCRC; var nposn = WHLNDX[pos - nposd * WCRC];
          WSLUT[x * WHTS + posn] = new Wst {
            msk = (ushort)(1 << (int)(posn & 0xF)),
            mlt = (byte)(m * WPC),
            xtr = (byte)(WPC * nposd + (nposn >> 4) - (posn >> 4)),
            nxt = (ushort)(WHTS * x + nposn)
          };
        }
      }
      MCPY = new ushort[PGSZ]; foreach (var lp in BWHLPRMS.SkipWhile(p => p < FSTCP)) {
        var p = (uint)lp;
        var k = (p * p - FSTBP) >> 1; var pd = p / WCRC; var kd = k / WCRC; var kn = WHLNDX[k - kd * WCRC];
        for (uint w = kd * WPC + (uint)(kn >> 4), ndx = WHLNDX[(2 * WCRC + p - FSTBP) / 2] * WHTS + kn; w < MCPY.Length; ) {
          var st = WSLUT[ndx]; MCPY[w] |= st.msk; w += st.mlt * pd + st.xtr; ndx = st.nxt;
        }
      }
    }

    #endregion

    #region public class

    // this class implements the enumeration (IEnumerator).
    //    it works by farming out tasks culling pages, which it then processes in order by
    //    enumerating the found primes as recognized by the remaining non-composite bits
    //    in the cull page buffers.
    class nmrtr : IEnumerator<ulong>, IEnumerator, IDisposable {
      PrcsSpc[] ps = new PrcsSpc[NUMPRCSPCS]; ushort[] buf;
      public nmrtr() {
        for (var s = 0u; s < NUMPRCSPCS; ++s) ps[s] = new PrcsSpc { buf = new ushort[BFSZ] };
        for (var s = 1u; s < NUMPRCSPCS; ++s) {
          ps[s].tsk = cullbftsk((s - 1u) * BFRNG, ps[s].buf, (bfr) => { });
        } buf = ps[0].buf;
      }
      ulong _curr, i = (ulong)-WHLPTRN[WHTS - 1]; int b = -BWHLPRMS.Length - 1; uint wi = WHTS - 1; ushort v, msk = 0;
      public ulong Current { get { return this._curr; } } object IEnumerator.Current { get { return this._curr; } }
      public bool MoveNext() {
        if (b < 0) {
          if (b == -1) b += buf.Length; //no yield!!! so automatically comes around again
          else { this._curr = (ulong)BWHLPRMS[BWHLPRMS.Length + (++b)]; return true; }
        }
        do {
          i += WHLPTRN[wi++]; if (wi >= WHTS) wi = 0; if ((this.msk <<= 1) == 0) {
            if (++b >= BFSZ) {
              b = 0; for (var prc = 0; prc < NUMPRCSPCS - 1; ++prc) ps[prc] = ps[prc + 1];
              ps[NUMPRCSPCS - 1u].buf = buf;
              ps[NUMPRCSPCS - 1u].tsk = cullbftsk(i + (NUMPRCSPCS - 1u) * BFRNG, buf, (bfr) => { });
              ps[0].tsk.Wait(); buf = ps[0].buf;
            } v = buf[b]; this.msk = 1;
          }
        }
        while ((v & msk) != 0u); _curr = FSTBP + i + i; return true;
      }
      public void Reset() { throw new Exception("Primes enumeration reset not implemented!!!"); }
      public void Dispose() { }
    }

    #endregion

    #region public instance method and associated sub private method

    /// <summary>
    /// Gets the enumerator for the primes.
    /// </summary>
    /// <returns>The enumerator of the primes.</returns>
    public IEnumerator<ulong> GetEnumerator() { return new nmrtr(); }

    /// <summary>
    /// Gets the enumerator for the primes.
    /// </summary>
    /// <returns>The enumerator of the primes.</returns>
    IEnumerator IEnumerable.GetEnumerator() { return new nmrtr(); }

    #endregion

    #region public static methods

    /// <summary>
    /// Gets the count of primes up the number, inclusively.
    /// </summary>
    /// <param name="top_number">The ulong top number to check for prime.</param>
    /// <returns>The long number of primes found.</returns>
    public static long CountTo(ulong top_number) {
      if (top_number < FSTBP) return BWHLPRMS.TakeWhile(p => p <= top_number).Count();
      var cnt = (long)BWHLPRMS.Length;
      IterateTo(top_number, (lowi, lim, b) => { Interlocked.Add(ref cnt, count(lim, b)); }); return cnt;
    }

    /// <summary>
    /// Gets the sum of the primes up the number, inclusively.
    /// </summary>
    /// <param name="top_number">The uint top number to check for prime.</param>
    /// <returns>The ulong sum of all the primes found.</returns>
    public static ulong SumTo(uint top_number) {
      if (top_number < FSTBP) return (ulong)BWHLPRMS.TakeWhile(p => p <= top_number).Aggregate(0u, (acc, p) => acc += p);
      var sum = (long)BWHLPRMS.Aggregate(0u, (acc, p) => acc += p);
      Func<ulong, uint, ushort[], long> sumbf = (lowi, bitlim, buf) => {
        var acc = 0L; for (uint i = 0, wi = 0, msk = 0x8000, w = 0, v = 0; i < bitlim;
            i += WHLPTRN[wi++], wi = wi >= WHTS ? 0 : wi) {
          if (msk >= 0x8000) { msk = 1; v = buf[w++]; } else msk <<= 1;
          if ((v & msk) == 0) acc += (long)(FSTBP + ((lowi + i) << 1));
        } return acc;
      };
      IterateTo(top_number, (pos, lim, b) => { Interlocked.Add(ref sum, sumbf(pos, lim, b)); }); return (ulong)sum;
    }

    /// <summary>
    /// Gets the prime number at the zero based index number given.
    /// </summary>
    /// <param name="index">The long zero-based index number for the prime.</param>
    /// <returns>The ulong prime found at the given index.</returns>
    public static ulong ElementAt(long index) {
      if (index < BWHLPRMS.Length) return (ulong)BWHLPRMS.ElementAt((int)index);
      long cnt = BWHLPRMS.Length; var ndx = 0UL; var cycl = 0u; var bit = 0u; IterateUntil((lwi, bfr) => {
        var c = count(BFRNG, bfr); if ((cnt += c) < index) return false; ndx = lwi; cnt -= c; c = 0;
        do { var w = cycl++ * WPC; c = CLUT[bfr[w++]] + CLUT[bfr[w++]] + CLUT[bfr[w]]; cnt += c; } while (cnt < index);
        cnt -= c; var y = (--cycl) * WPC; ulong v = ((ulong)bfr[y + 2] << 32) + ((ulong)bfr[y + 1] << 16) + bfr[y];
        do { if ((v & (1UL << ((int)bit++))) == 0) ++cnt; } while (cnt <= index); --bit; return true;
      }); return FSTBP + ((ndx + cycl * WCRC + WHLPOS[bit]) << 1);
    }

    #endregion
  }

以上code将枚举素数在对四个核心约1.55秒钟就有一个十亿(八线程包括HT)i7-2700K(3.5 GHz)和你的E7500将成为可能高达较慢由于较少线程和略少时钟速度的四倍。大约四分之三的时间就是刚刚运行枚举的MoveNext()方法和当前财产的时候,所以我提供公共静态方法CountTo,SumTo和ElementAt的来计算一个素数的数量或金额范围和第n从零开始素,分别在不使用枚举。使用UltimatePrimesSoE.CountTo(1000000000)静态方法在我的机器上大约0.32秒生产50847534,因此比英特尔E7500。

The above code will enumerate the primes to one billion in about 1.55 seconds on a four core (eight threads including HT) i7-2700K (3.5 GHz) and your E7500 will be perhaps up to four times slower due to less threads and slightly less clock speed. About three quarters of that time is just the time to run the enumeration MoveNext() method and Current property, so I provide the public static methods "CountTo", "SumTo" and "ElementAt" to compute the number or sum of primes in a range and the nth zero-based prime, respectively, without using enumeration. Using the UltimatePrimesSoE.CountTo(1000000000) static method produces 50847534 in about 0.32 seconds on my machine, so shouldn't take longer than about 1.28 seconds on the Intel E7500.

EDIT_ADD:有趣的是,code运行30%的x86 32位模式快于64位的64位模式,可能是由于避免延长UINT32数字的轻微额外开销到ULONG的。所有上述定时的是64位模式。 END_EDIT_ADD

EDIT_ADD: Interestingly, this code runs 30% faster in x86 32-bit mode than in x64 64-bit mode, likely due to avoiding the slight extra overhead of extending the uint32 numbers to ulong's. All of the above timings are for 64-bit mode. END_EDIT_ADD

在code几乎300(密)线,这实现并不简单,但是这是在做所有描述的优化,使这个code如此高效的成本。这是不是所有$ C $的有更多的行C的亚伦Murgatroyd对方回答;虽然他的code是密度较小,他的code是也大约四倍慢。事实上,几乎所有的执行时间都花在了我的code的私有静态cullbf的方法,这是只有四个语句加上长的范围条件检查的最后一个循环;在code的所有剩下的只是支持该循环反复应用。

At almost 300 (dense) lines of code, this implementation isn't simple, but that's the cost of doing all of the described optimizations that make this code so efficient. It isn't all that many more lines of code that the other answer by Aaron Murgatroyd; although his code is less dense, his code is also about four times as slow. In fact, almost all of the execution time is spent in the final "for loop" of the my code's private static "cullbf" method, which is only four statements long plus the range condition check; all the rest of the code is just in support of repeated applications of that loop.

究其原因,这code是比其他答案快是出于同样的原因,这code是比其他的code比他只处理奇数的步骤(1)优化速度更快总理候选人。他使​​用的多是在只有30%的优势,而不是在正确应用,因为它每首相,而不是对小网页上的所有素数线程应该是可能的一个真正的四核CPU的四个因素几乎完全失效,而他使用作为消除束缚每个回路检查数组的DOTNET的计算成本的方法不安全的指针数组访问实际上减缓了$ C $相比,只是使用数组直接包括边界检查的DOTNET的只是时间C(JIT)编译器生成相当低效code的指针访问。此外,他的code列举了素数,就像我的code可以做的,这枚举有每个枚举主CPU时钟周期成本,这也是他的情况略差的10作为他使用内置的C#迭代其中有些比我滚你自己的IEnumerator接口效率较低。然而,最大速度,我们应该避免完全列举;然而,即使他提供的计数实例方法使用一个foreach循环,这意味着枚举。

The reasons that this code is faster than that other answer are for the same reasons that this code is faster than your code other than he does the Step (1) optimization of only processing odd prime candidates. His use of multiprocessing is almost completely ineffective as in only a 30% advantage rather than the factor of four that should be possible on a true four core CPU when applied correctly as it threads per prime rather than for all primes over small pages, and his use of unsafe pointer array access as a method of eliminating the DotNet computational cost of an array bound check per loop actually slows the code compared to just using arrays directly including the bounds check as the DotNet Just In Time (JIT) compiler produces quite inefficient code for pointer access. In addition, his code enumerates the primes just as my code can do, which enumeration has a 10's of CPU clock cycle cost per enumerated prime, which is also slightly worse in his case as he uses the built-in C# iterators which are somewhat less efficient than my "roll-your-own" IEnumerator interface. However, for maximum speed, we should avoid enumeration entirely; however even his supplied "Count" instance method uses a "foreach" loop which means enumeration.

总之,这个答案code你的E7500 CPU上产生质的答案不是你的问题的code快约25倍(与更多的核心/线程CPU上有更多倍的速度),占用更少的内存,并且并不限定于在对32位的数字范围时首要范围,但在增加的code复杂的成本。

In summary, this answer code produces prime answers about 25 times faster than your question's code on your E7500 CPU (many more times faster on a CPU with more cores/threads) uses much less memory, and is not limited to smaller prime ranges of about the 32-bit number range, but at a cost of increased code complexity.

这篇关于我如何使用多线程C#实现筛埃拉托色尼?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆