在无需声明巨大的2D数组的情况下找到平均尺寸分​​布的有效方法是什么? [英] What would be an efficient way to find the averaged size distribution without having to declare a gigantic 2D array?

查看:74
本文介绍了在无需声明巨大的2D数组的情况下找到平均尺寸分​​布的有效方法是什么?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

考虑一个L x L大小的矩阵M,其条目可以为0或1.每个元素为1的概率为p,为0的概率为1-p.我将标记为1的元素称为黑色元素,将标记为0的元素称为白色元素.我正在尝试编写以下代码:

  • 生成带有条目0和1的随机矩阵.我需要输入矩阵L的大小和概率p.

  • 标记属于同一簇的所有黑色元素的编号相同. (将黑色元素的簇定义为值为1的单元格图中的最大连接分量,其中边连接其行和列相差至多1(每个单元格最多八个邻居)的单元格换句话说,如果矩阵的两个黑色元素共享一条边或一个顶点,则将它们视为属于同一黑色簇,也就是说,将矩阵视为一个大正方形,而将元素视为该大正方形中的小正方形./em>)

  • 在一个从p = 0到p = 100的循环中(我将概率p表示为百分比):将每种大小的黑色簇的数量写入对应于该p值的文件./li>

例如:

输入:p = 30,L = 50

Output (将每个p写入数据文件;因此该程序创建了101个数据文件,从p = 0到p = 100):

1100(即有100个大小为1的黑色簇)

2 20(即,有20个大小为2的黑色簇)

3 15(即,有15个大小为3的黑色簇)

以此类推...

#include "stdafx.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <string.h>
int *labels;
int  n_labels = 0;

int uf_find(int x) {
    int y = x;
    while (labels[y] != y)
        y = labels[y];

    while (labels[x] != x) {
        int z = labels[x];
        labels[x] = y;
        x = z;
    }
    return y;
}

int uf_union(int x, int y) {
    return labels[uf_find(x)] = uf_find(y);
}

int uf_make_set(void) {
    labels[0] ++;
    assert(labels[0] < n_labels);
    labels[labels[0]] = labels[0];
    return labels[0];
}

void uf_initialize(int max_labels) {
    n_labels = max_labels;
    labels = (int*)calloc(sizeof(int), n_labels);
    labels[0] = 0;
}

void uf_done(void) {
    free(labels);
    labels = 0;
}

#define max(a,b) (a>b?a:b)
#define min(a,b) (a>b?b:a)

int hoshen_kopelman(int **matrix, int m, int n) {

    uf_initialize(m * n / 2);

    for (int i = 0; i<m; i++)
        for (int j = 0; j<n; j++)
            if (matrix[i][j]) {

                int up = (i == 0 ? 0 : matrix[i - 1][j]);
                int left = (j == 0 ? 0 : matrix[i][j - 1]);

                switch (!!up + !!left) {

                case 0:
                    matrix[i][j] = uf_make_set();
                    break;

                case 1:
                    matrix[i][j] = max(up, left);
                    break;

                case 2:
                    matrix[i][j] = uf_union(up, left);
                    break;
                }
                int north_west;
                if (i == 0 || j == 0)
                    north_west = 0;
                else
                    north_west = matrix[i - 1][j - 1];

                int north_east;
                if (i == 0 || j == (n - 1))
                    north_east = 0;
                else
                    north_east = matrix[i - 1][j + 1];

                if (!!north_west == 1)
                {
                    if (i != 0 && j != 0)
                    {
                        //if (rand() % 2 == 1)
                        //{
                            if (matrix[i][j - 1] == 0 && matrix[i - 1][j] == 0)
                            {
                                if (!!matrix[i][j] == 0)
                                    matrix[i][j] = north_west;
                                else
                                    uf_union(north_west, matrix[i][j]);
                            }
                        //}

                    }
                    else if (i == 0 || j == 0)
                    {

                    }

                }
                if (!!north_east == 1)
                {
                    if (i != 0 && j != n - 1)
                    {
                        //if (rand() % 2 == 1)
                        //{
                            if (matrix[i - 1][j] == 0 && matrix[i][j + 1] == 0)
                            {
                                if (!!matrix[i][j] == 0)
                                    matrix[i][j] = north_east;
                                else
                                    uf_union(north_east, matrix[i][j]);
                            }
                        //}
                    }
                    else if (i == 0 || j == n - 1)
                    {

                    }
                }
            }
    int *new_labels = (int*)calloc(sizeof(int), n_labels);

    for (int i = 0; i<m; i++)
        for (int j = 0; j<n; j++)
            if (matrix[i][j]) {
                int x = uf_find(matrix[i][j]);
                if (new_labels[x] == 0) {
                    new_labels[0]++;
                    new_labels[x] = new_labels[0];
                }
                matrix[i][j] = new_labels[x];
            }

    int total_clusters = new_labels[0];

    free(new_labels);
    uf_done();

    return total_clusters;
}

void check_labelling(int **matrix, int m, int n) {
    int N, S, E, W;
    for (int i = 0; i<m; i++)
        for (int j = 0; j<n; j++)
            if (matrix[i][j]) {
                N = (i == 0 ? 0 : matrix[i - 1][j]);
                S = (i == m - 1 ? 0 : matrix[i + 1][j]);
                E = (j == n - 1 ? 0 : matrix[i][j + 1]);
                W = (j == 0 ? 0 : matrix[i][j - 1]);

                assert(N == 0 || matrix[i][j] == N);
                assert(S == 0 || matrix[i][j] == S);
                assert(E == 0 || matrix[i][j] == E);
                assert(W == 0 || matrix[i][j] == W);
            }
}

int cluster_count(int **matrix, int size, int N)
{
    int i;
    int j;
    int count = 0;
    for (i = 0; i < size; i++)
    {
        for (j = 0; j < size; j++)
        {
            if (matrix[i][j] == N)
                count++;
        }
    }
    return count;
}

int main(int argc, char **argv)
{
    srand((unsigned int)time(0));
    int p = 0;
    printf("Enter number of rows/columns: ");
    int size = 0;
    scanf("%d", &size);
    printf("\n");
    FILE *fp;

    printf("Enter number of averaging iterations: ");
    int iterations = 0;

    scanf("%d", &iterations);

        for (int p = 0; p <= 100; p++)
        { 
            char str[100];
            sprintf(str, "BlackSizeDistribution%03i.txt", p);
            fp = fopen(str, "w");
            int **matrix;
            matrix = (int**)calloc(10, sizeof(int*));
            int** matrix_new = (int**)calloc(10, sizeof(int*));
            matrix_new = (int **)realloc(matrix, sizeof(int*) * size);
            matrix = matrix_new;
        for (int i = 0; i < size; i++)
        {
            matrix[i] = (int *)calloc(size, sizeof(int));
            for (int j = 0; j < size; j++)
            {
                int z = rand() % 100;
                z = z + 1;
                if (p == 0)
                    matrix[i][j] = 0;
                if (z <= p)
                    matrix[i][j] = 1;
                else
                    matrix[i][j] = 0;
            }
        }

        hoshen_kopelman(matrix, size, size);
        int highest = matrix[0][0];
        for (int i = 0; i < size; i++)
            for (int j = 0; j < size; j++)
                if (highest < matrix[i][j])
                    highest = matrix[i][j];
        int* counter = (int*)calloc(sizeof(int*), highest + 1);
        int high = 0;
        for (int k = 1; k <= highest; k++)
        {
            counter[k] = cluster_count(matrix, size, k);
            if (counter[k] > high)
                high = counter[k];
        }
        int* size_distribution = (int*)calloc(sizeof(int*), high + 1);

        for (int y = 1; y <= high; y++)
        {
           int count = 0;
           for (int z = 1; z <= highest; z++)
               if (counter[z] == y)
                   count++;
           size_distribution[y] = count;
        }

        for (int k = 1; k <= high; k++)
        {
            fprintf(fp, "\n%d\t%d", k, size_distribution[k]);
            printf("\n%d\t%d", k, size_distribution[k]);
        }
        printf("\n");
        check_labelling(matrix, size, size);
        for (int i = 0; i < size; i++)
            free(matrix[i]);
        free(matrix);
        }
}

我使用该程序输出的数据文件,使用Gnuplot为0至100的每个p创建条形图.

一些示例图:

此对应于输入p = 3且L = 100

这个对应于p = 90且L = 100


好的,所以我认为这足以解决我提出的实际问题.

我编写的程序只为p的值输出1次迭代的值.由于该程序是用于科学计算的,因此我需要更准确的值,并且为此,我需要输出平均"大小分布值.超过50或100次迭代.我不确定如何进行平均.

更清楚地说,这就是我想要的:

说程序的三个不同运行的输出给我(比如说p = 3,L = 100)

Size    Iteration 1    Iteration 2    Iteration 3  Averaged Value (Over 3 iterations)

1       150            170             180         167

2       10             20              15          18

3       1              2               1           1

4       0              0               1           0  
.
.
.

我想做类似的事情,除了我需要对50或100次迭代进行平均以获取数学模型的准确值.顺便说一句,我真的不需要在数据文件中输出每次迭代的值,即迭代1,迭代2,迭代3....我需要打印"的是上表中的第一列和最后一列(具有平均值).

现在,我可能需要修改main函数以进行平均.

主要功能如下:

int main(int argc, char **argv)
    {
        srand((unsigned int)time(0));
        int p = 0;
        printf("Enter number of rows/columns: ");
        int size = 0;
        scanf("%d", &size);
        printf("\n");
        FILE *fp;

        printf("Enter number of averaging iterations: ");
        int iterations = 0;

        scanf("%d", &iterations);

            for (int p = 0; p <= 100; p++)
            { 
                char str[100];
                sprintf(str, "BlackSizeDistribution%03i.txt", p);
                fp = fopen(str, "w");
                int **matrix;
                matrix = (int**)calloc(10, sizeof(int*));
                int** matrix_new = (int**)calloc(10, sizeof(int*));
                matrix_new = (int **)realloc(matrix, sizeof(int*) * size);
                matrix = matrix_new;
            for (int i = 0; i < size; i++)
            {
                matrix[i] = (int *)calloc(size, sizeof(int));
                for (int j = 0; j < size; j++)
                {
                    int z = rand() % 100;
                    z = z + 1;
                    if (p == 0)
                        matrix[i][j] = 0;
                    if (z <= p)
                        matrix[i][j] = 1;
                    else
                        matrix[i][j] = 0;
                }
            }

            hoshen_kopelman(matrix, size, size);
            int highest = matrix[0][0];
            for (int i = 0; i < size; i++)
                for (int j = 0; j < size; j++)
                    if (highest < matrix[i][j])
                        highest = matrix[i][j];
            int* counter = (int*)calloc(sizeof(int*), highest + 1);
            int high = 0;
            for (int k = 1; k <= highest; k++)
            {
                counter[k] = cluster_count(matrix, size, k);
                if (counter[k] > high)
                    high = counter[k];
            }
            int* size_distribution = (int*)calloc(sizeof(int*), high + 1);

            for (int y = 1; y <= high; y++)
            {
               int count = 0;
               for (int z = 1; z <= highest; z++)
                   if (counter[z] == y)
                       count++;
               size_distribution[y] = count;
            }

            for (int k = 1; k <= high; k++)
            {
                fprintf(fp, "\n%d\t%d", k, size_distribution[k]);
                printf("\n%d\t%d", k, size_distribution[k]);
            }
            printf("\n");
            check_labelling(matrix, size, size);
            for (int i = 0; i < size; i++)
                free(matrix[i]);
            free(matrix);
            }
    }

我想到的方法之一是针对每次p循环声明一个大小为(largest size of black cluster possible for that) x (number of averaging iterations I want)的2D数组,并在程序结尾处从2D数组中提取每个p的平均大小分布.在大小为100 x 100(例如)的矩阵中,黑色簇的最大大小为10,000.但是对于较小的p值(例如p = 1,p = 20,...),这样的大型簇甚至都不会产生!因此,在一开始创建如此大的2D数组将浪费大量的内存空间,并且需要几天的时间来执行! (请记住,我需要为L = 1000和L = 5000运行此程序,如果可能的话,甚至需要为L = 10000运行该程序)

必须有其他更有效的方法来执行此操作.但是我不知道.任何帮助表示赞赏.

解决方案

啊,这个话题非常贴切! :)

由于您正在收集统计信息,并且仅在收集所述统计信息的过程中才需要进行准确的配置,因此您只需要两个L×L矩阵:一个用于保存颜色信息(一个白色,0到L×L黑色,因此可以容纳0到L 2 +1(包括两端)之间的整数的类型);另一个保存该类型的元素的数量(0到L 2 之间的整数(包括0和L 2 ).

要在多个迭代中保存每个L和p值每个大小的黑色簇的数量,您将需要第三个L×L + 1个元素数组;但是请注意,如果使用相同的L和p进行N次迭代,则其值可以达到N×L×L.

标准的C伪随机数生成器太可怕了;不要使用它.使用 Merenne Twister 或我最喜欢的

存储"uninteresting"或集群ID而不是仅存储"black"或"white";最初,所有(未连接)黑人都具有唯一的群集ID.这样,您可以实现不相交集并将连接的单元非常有效地连接到集群

这是我用伪代码实现的方式:

Let id[L*L]        be the color information; index is L*row+col
Let idcount[L*L+1] be the id count over one iteration
Let count[L*L+1]   be the number of clusters across N iterations

Let limit = (p / 100.0) * PRNG_MAX

Let white = L*L    (and blacks are 0 to L*L-1, inclusive)
Clear count[] to all zeros

For iter = 1 to N, inclusive:

    For row = 0 to L-1, inclusive:

        If PRNG() <= limit:
            Let id[L*row] = L*row
        Else:
            Let id[L*row] = white
        End If

        For col = 1 to L-1, inclusive:
            If PRNG() <= limit:
                If id[L*row+col-1] == white:
                    Let id[L*row+col] = L*row + col
                Else:
                    Let id[L*row+col] = id[L*row+col-1]
                End If
            Else:
                Let id[L*row+col] = white
            End If
        End For

    End For

请注意,在生成单元格ID时,我只是通过对连续的黑色单元格重复使用相同的ID来进行水平连接.此时,如果PRNG()返回介于0和PRNG_MAX之间(包括0和id[][]现在将填充各种ID的黑色单元格的可能性为p.

接下来,您要进行联接.由于水平连接的单元已经连接在一起,因此需要进行垂直和两个对角线的连接.请注意,为了提高效率,在连接时应使用路径展平.

    For row = 0 to L-2, inclusive:
        For col = 0 to L-1, inclusive:
            If (id[L*row+col] != white) And (id[L*row+L+col] != white):
                Join id[L*row+col] and id[L*row+L+col]
            End If
        End For
    End For

    For row = 0 to L-2, inclusive:
        For col = 0 to L-2, inclusive:
            If (id[L*row+col] != white) And (id[L*row+L+col+1] != white):
                Join id[L*row+col] and id[L*row+L+col+1]
            End If
        End For
    End For

    For row = 1 to L-1, inclusive:
        For col = 0 to L-2, inclusive:
            If (id[L*row+col] != white) And (id[L*row-L+col+1] != white):
                Join id[L*row+col] and id[L*row-L+col+1]
            End If
        End For
    End For

当然,您应该组合循环,以保持访问的最佳位置. (分别执行col == 0,并在单独的内部循环中执行row == 0,以使if子句最小化,因为它们往往很慢.)您甚至可以使id数组(L + 1) 2 的大小,用白色填充单元格的外部边缘,以简化上述连接,形成单个双环,但需要额外使用4L + 4的单元格.

这时,您需要展平每个不相交的集合.不是white的每个ID值要么是L*row+col(表示"this"),要么是对另一个单元格的引用.展平意味着对于每个ID,我们在链中找到最终的ID,并改为使用它:

    For i = 0 to L*L-1, inclusive:
        If (id[i] != white):
            Let k = i
            While (id[k] != k):
                k = id[k]
            End While
            id[i] = k
        End If
    End For

接下来,我们需要计算具有特定ID的单元格的数量:

    Clear idn[]

    For i = 0 to L*L-1, inclusive:
        Increment idn[id[i]]
    End For

由于我们对N次迭代中特定大小的簇的数量感兴趣,因此我们可以简单地通过在此迭代中找到的每个特定大小的簇来更新count数组:

    For i = 1 to L*L, inclusive:
        Increment count[idn[i]]
    End For
    Let count[0] = 0

End For

此时,count[i]包含通过N次迭代发现的i个单元的黑色簇的数量;换句话说,它是在N次迭代中看到的簇大小的直方图(离散分布).


上述的一种实现方式如下.

(第一个实现在数组大小和簇标签方面存在问题;此版本将该部分拆分到一个单独的文件中,并且更易于验证其正确性.不过,这只是第二个版本,因此我不考虑生产质量.)

首先,操作矩阵 cluster.h 的函数:

#ifndef   CLUSTER_H
#define   CLUSTER_H
#include <stdlib.h>
#include <inttypes.h>
#include <string.h>
#include <stdio.h>
#include <time.h>

/*
   This is in the public domain.
   Written by Nominal Animal <question@nominal-animal.net>
*/

typedef  uint32_t  cluster_label;
typedef  uint32_t  cluster_size;

static size_t  rows = 0;    /* Number of mutable rows */
static size_t  cols = 0;    /* Number of mutable columns */
static size_t  nrows = 0;   /* Number of accessible rows == rows + 1 */
static size_t  ncols = 0;   /* Number of accessible columns == cols + 1 */
static size_t  cells = 0;   /* Total number of cells == nrows*ncols */
static size_t  empty = 0;   /* Highest-numbered label == nrows*ncols - 1 */
static size_t  sizes = 0;   /* Number of possible cluster sizes == rows*cols + 1 */
#define  INDEX(row, col)  (ncols*(row) + (col))

cluster_label   *label  = NULL;  /* 2D contents: label[ncols*(row) + (col)] */
cluster_size    *occurs = NULL;  /* Number of occurrences of each label value */

/* Xorshift64* PRNG used for generating the matrix.
*/
static uint64_t  prng_state = 1;

static inline uint64_t  prng_u64(void)
{
    uint64_t  state = prng_state;
    state ^= state >> 12;
    state ^= state << 25;
    state ^= state >> 27;
    prng_state = state;
    return state * UINT64_C(2685821657736338717);
}

static inline uint64_t  prng_u64_less(void)
{
    uint64_t  result;
    do {
        result = prng_u64();
    } while (result == UINT64_C(18446744073709551615));
    return result;
}

static inline uint64_t  prng_seed(const uint64_t  seed)
{
    if (seed)
        prng_state = seed;
    else
        prng_state = 1;

    return prng_state;
}

static inline uint64_t  prng_randomize(void)
{
    int       discard = 1024;
    uint64_t  seed;
    FILE     *in;

    /* By default, use time. */
    prng_seed( ((uint64_t)time(NULL) * 2832631) ^
               ((uint64_t)clock() * 1120939) );

#ifdef __linux__
    /* On Linux, read from /dev/urandom. */
    in = fopen("/dev/urandom", "r");
    if (in) {
        if (fread(&seed, sizeof seed, 1, in) == 1)
            prng_seed(seed);
        fclose(in);        
    }
#endif

    /* Churn the state a bit. */
    seed = prng_state;
    while (discard-->0) {
        seed ^= seed >> 12;
        seed ^= seed << 25;
        seed ^= seed >> 27;
    }
    return prng_state = seed;
}




static inline void cluster_free(void)
{
    free(occurs); occurs = NULL;
    free(label);  label = NULL;
    rows = 0;
    cols = 0;
    nrows = 0;
    ncols = 0;
    cells = 0;
    empty = 0;
    sizes = 0;
}


static int cluster_init(const size_t want_cols, const size_t want_rows)
{
    cluster_free();

    if (want_cols < 1 || want_rows < 1)
        return -1; /* Invalid number of rows or columns */

    rows = want_rows;
    cols = want_cols;

    nrows = rows + 1;
    ncols = cols + 1;

    cells = nrows * ncols;

    if ((size_t)(cells / nrows) != ncols ||
        nrows <= rows || ncols <= cols)
        return -1; /* Size overflows */

    empty = nrows * ncols - 1;

    sizes = rows * cols + 1;

    label  = calloc(cells, sizeof label[0]);
    occurs = calloc(cells, sizeof occurs[0]);
    if (!label || !occurs) {
        cluster_free();
        return -1; /* Not enough memory. */
    }

    return 0;
}


static void join2(size_t i1, size_t i2)
{
    size_t  root1 = i1, root2 = i2, root;

    while (root1 != label[root1])
        root1 = label[root1];

    while (root2 != label[root2])
        root2 = label[root2];

    root = root1;
    if (root > root2)
        root = root2;

    while (label[i1] != root) {
        const size_t  temp = label[i1];
        label[i1] = root;
        i1 = temp;
    }

    while (label[i2] != root) {
        const size_t  temp = label[i2];
        label[i2] = root;
        i2 = temp;
    }
}

static void join3(size_t i1, size_t i2, size_t i3)
{
    size_t  root1 = i1, root2 = i2, root3 = i3, root;

    while (root1 != label[root1])
        root1 = label[root1];

    while (root2 != label[root2])
        root2 = label[root2];

    while (root3 != label[root3])
        root3 = label[root3];

    root = root1;
    if (root > root2)
        root = root2;
    if (root > root3)
        root = root3;

    while (label[i1] != root) {
        const size_t  temp = label[i1];
        label[i1] = root;
        i1 = temp;
    }

    while (label[i2] != root) {
        const size_t  temp = label[i2];
        label[i2] = root;
        i2 = temp;
    }

    while (label[i3] != root) {
        const size_t  temp = label[i3];
        label[i3] = root;
        i3 = temp;
    }
}

static void join4(size_t i1, size_t i2, size_t i3, size_t i4)
{
    size_t  root1 = i1, root2 = i2, root3 = i3, root4 = i4, root;

    while (root1 != label[root1])
        root1 = label[root1];

    while (root2 != label[root2])
        root2 = label[root2];

    while (root3 != label[root3])
        root3 = label[root3];

    while (root4 != label[root4])
        root4 = label[root4];

    root = root1;
    if (root > root2)
        root = root2;
    if (root > root3)
        root = root3;
    if (root > root4)
        root = root4;

    while (label[i1] != root) {
        const size_t  temp = label[i1];
        label[i1] = root;
        i1 = temp;
    }

    while (label[i2] != root) {
        const size_t  temp = label[i2];
        label[i2] = root;
        i2 = temp;
    }

    while (label[i3] != root) {
        const size_t  temp = label[i3];
        label[i3] = root;
        i3 = temp;
    }

    while (label[i4] != root) {
        const size_t  temp = label[i4];
        label[i4] = root;
        i4 = temp;
    }
}

static void cluster_fill(const uint64_t plimit)
{
    size_t  r, c;

    /* Fill the matrix with the specified probability.
       For efficiency, we use the same label for all
       horizontally consecutive cells.
    */
    for (r = 0; r < rows; r++) {
        const size_t  imax = ncols*r + cols;
        cluster_label last;
        size_t        i = ncols*r;

        last = (prng_u64_less() < plimit) ? i : empty;
        label[i] = last;

        while (++i < imax) {
            if (prng_u64_less() < plimit) {
                if (last == empty)
                    last = i;
            } else
                last = empty;

            label[i] = last;
        }

        label[imax] = empty;
    }

    /* Fill the extra row with empty, too. */
    {
        cluster_label       *ptr = label + ncols*rows;
        cluster_label *const end = label + ncols*nrows;
        while (ptr < end)
            *(ptr++) = empty;
    }

    /* On the very first row, we need to join non-empty cells
       vertically and diagonally down. */
    for (c = 0; c < cols; c++)
        switch ( ((label[c]         < empty) << 0) |
                 ((label[c+ncols]   < empty) << 1) |
                 ((label[c+ncols+1] < empty) << 2) ) {
            /*  <1>    *
             *   2  4  */
        case 3: /* Join down */
            join2(c, c+ncols); break;
        case 5: /* Join down right */
            join2(c, c+ncols+1); break;
        case 7: /* Join down and down right */
            join3(c, c+ncols, c+ncols+1); break;
        }

    /* On the other rows, we need to join non-empty cells
       vertically, diagonally down, and diagonally up. */
    for (r = 1; r < rows; r++) {
        const size_t  imax = ncols*r + cols;
        size_t        i;
        for (i = ncols*r; i < imax; i++)
            switch ( ((label[i]         < empty) << 0) |
                     ((label[i-ncols+1] < empty) << 1) |
                     ((label[i+ncols]   < empty) << 2) |
                     ((label[i+ncols+1] < empty) << 3) ) {
            /*      2  *
             *  <1>    *
             *   4  8  */
            case 3: /* Diagonally up */
                join2(i, i-ncols+1); break;
            case 5: /* Down */
                join2(i, i+ncols); break;
            case 7: /* Down and diagonally up */
                join3(i, i-ncols+1, i+ncols); break;
            case 9: /* Diagonally down */
                join2(i, i+ncols+1); break;
            case 11: /* Diagonally up and diagonally down */
                join3(i, i-ncols+1, i+ncols+1); break;
            case 13: /* Down and diagonally down */
                join3(i, i+ncols, i+ncols+1); break;
            case 15: /* Down, diagonally down, and diagonally up */
                join4(i, i-ncols+1, i+ncols, i+ncols+1); break;
            }
    }

    /* Flatten the labels, so that all cells belonging to the
       same cluster will have the same label. */
    {
        const size_t  imax = ncols*rows;
        size_t        i;
        for (i = 0; i < imax; i++)
            if (label[i] < empty) {
                size_t  k = i, kroot = i;

                while (kroot != label[kroot])
                    kroot = label[kroot];

                while (label[k] != kroot) {
                    const size_t  temp = label[k];
                    label[k] = kroot;
                    k = temp;
                }
            }
    }

    /* Count the number of occurrences of each label. */
    memset(occurs, 0, (empty + 1) * sizeof occurs[0]);
    {
        cluster_label *const end = label + ncols*rows;
        cluster_label       *ptr = label;

        while (ptr < end)
            ++occurs[*(ptr++)];
    }
}

#endif /* CLUSTER_H */

请注意,要获得完整的概率范围,prng_u64_less()函数绝不会返回最高的uint64_t.不过,这太过精确了.

main.c 解析输入参数,迭代并打印结果:

#include <stdlib.h>
#include <inttypes.h>
#include <stdio.h>
#include "cluster.h"

/*
   This program is in the public domain.
   Written by Nominal Animal <question@nominal-animal.net>
*/

int main(int argc, char *argv[])
{
    uint64_t      *count = NULL;
    uint64_t       plimit;
    unsigned long  size, iterations, i;
    double         probability;
    char           dummy;

    if (argc != 4) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s SIZE ITERATIONS PROBABILITY\n", argv[0]);
        fprintf(stderr, "\n");
        fprintf(stderr, "Where  SIZE         sets the matrix size (SIZE x SIZE),\n");
        fprintf(stderr, "       ITERATIONS   is the number of iterations, and\n");
        fprintf(stderr, "       PROBABILITY  is the probability [0, 1] of a cell\n");
        fprintf(stderr, "                    being non-empty.\n");
        fprintf(stderr, "\n");
        return EXIT_FAILURE;
    }

    if (sscanf(argv[1], " %lu %c", &size, &dummy) != 1 || size < 1u) {
        fprintf(stderr, "%s: Invalid matrix size.\n", argv[1]);
        return EXIT_FAILURE;
    }

    if (sscanf(argv[2], " %lu %c", &iterations, &dummy) != 1 || iterations < 1u) {
        fprintf(stderr, "%s: Invalid number of iterations.\n", argv[2]);
        return EXIT_FAILURE;
    }

    if (sscanf(argv[3], " %lf %c", &probability, &dummy) != 1 || probability < 0.0 || probability > 1.0) {
        fprintf(stderr, "%s: Invalid probability.\n", argv[3]);
        return EXIT_FAILURE;
    }

    /* Technically, we want plimit = (uint64_t)(0.5 + 18446744073709551615.0*p),
       but doubles do not have the precision for that. */
    if (probability > 0.9999999999999999)
        plimit = UINT64_C(18446744073709551615);
    else
    if (probability <= 0)
        plimit = UINT64_C(0);
    else
        plimit = (uint64_t)(18446744073709551616.0 * probability);

    /* Try allocating memory for the matrix and the label count array. */
    if (size > 2147483646u || cluster_init(size, size)) {
        fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]);
        return EXIT_FAILURE;
    }

    /* Try allocating memory for the cluster size histogram. */
    count = calloc(sizes + 1, sizeof count[0]);
    if (!count) {
        fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]);
        return EXIT_FAILURE;
    }

    printf("# Xorshift64* seed is %" PRIu64 "\n", prng_randomize());
    printf("# Matrix size is %lu x %lu\n", size, size);
    printf("# Probability of a cell to be non-empty is %.6f%%\n",
           100.0 * ((double)plimit / 18446744073709551615.0));
    printf("# Collecting statistics over %lu iterations\n", iterations);
    printf("# Size Count CountPerIteration\n");
    fflush(stdout);

    for (i = 0u; i < iterations; i++) {
        size_t  k = empty;

        cluster_fill(plimit);

        /* Add cluster sizes to the histogram. */
        while (k-->0)
            ++count[occurs[k]];
    }

    /* Print the histogram of cluster sizes. */
    {
        size_t k = 1;

        printf("%zu %" PRIu64 " %.3f\n", k, count[k], (double)count[k] / (double)iterations);

        for (k = 2; k < sizes; k++)
            if (count[k-1] != 0 || count[k] != 0 || count[k+1] != 0)
                printf("%zu %" PRIu64 " %.3f\n", k, count[k], (double)count[k] / (double)iterations);
    }

    return EXIT_SUCCESS;
}

程序以Gnuplot兼容的格式输出簇大小分布(直方图),从以#开头的几行注释行开始,以指示用于计算结果的确切参数.

输出中的第一列是群集大小(群集中的单元数),第二列是在迭代过程中找到的此类群集的确切计数,第三列是观察到的发生概率(即,总出现次数除以迭代次数.

这是L = 1000,N = 1000的簇大小分布,其中p的几个不同值如下: 请注意,两个轴都具有对数标度,因此它是对数-对数图.它是通过多次运行上述程序(对于每个唯一的p都运行一次)并将输出保存到文件(名称包含p的名称),然后将它们打印到Gnuplot中的单个图中来生成的.

为了验证聚类功能,我编写了一个测试程序,该程序生成一个矩阵,为每个聚类分配随机颜色,并将其输出为PPM图像(您可以使用NetPBM工具或基本上任何图像处理程序进行操作) ; ppm.c :

#include <stdlib.h>
#include <inttypes.h>
#include <stdio.h>
#include "cluster.h"

/*
   This program is in the public domain.
   Written by Nominal Animal <question@nominal-animal.net>
*/

int main(int argc, char *argv[])
{
    uint64_t       plimit;
    uint32_t      *color;
    unsigned long  size, r, c;
    double         probability;
    char           dummy;

    if (argc != 3) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s SIZE PROBABILITY > matrix.ppm\n", argv[0]);
        fprintf(stderr, "\n");
        fprintf(stderr, "Where  SIZE         sets the matrix size (SIZE x SIZE),\n");
        fprintf(stderr, "       PROBABILITY  is the probability [0, 1] of a cell\n");
        fprintf(stderr, "                    being non-empty.\n");
        fprintf(stderr, "\n");
        return EXIT_FAILURE;
    }

    if (sscanf(argv[1], " %lu %c", &size, &dummy) != 1 || size < 1u) {
        fprintf(stderr, "%s: Invalid matrix size.\n", argv[1]);
        return EXIT_FAILURE;
    }

    if (sscanf(argv[2], " %lf %c", &probability, &dummy) != 1 || probability < 0.0 || probability > 1.0) {
        fprintf(stderr, "%s: Invalid probability.\n", argv[2]);
        return EXIT_FAILURE;
    }

    /* Technically, we want plimit = (uint64_t)(0.5 + 18446744073709551615.0*p),
       but doubles do not have the precision for that. */
    if (probability > 0.9999999999999999)
        plimit = UINT64_C(18446744073709551615);
    else
    if (probability <= 0)
        plimit = UINT64_C(0);
    else
        plimit = (uint64_t)(18446744073709551616.0 * probability);

    /* Try allocating memory for the matrix and the label count array. */
    if (size > 2147483646u || cluster_init(size, size)) {
        fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]);
        return EXIT_FAILURE;
    }

    /* Try allocating memory for the color lookup array. */
    color = calloc(empty + 1, sizeof color[0]);
    if (!color) {
        fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]);
        return EXIT_FAILURE;
    }

    fprintf(stderr, "Using Xorshift64* seed %" PRIu64 "\n", prng_randomize());
    fflush(stderr);

    /* Construct the matrix. */
    cluster_fill(plimit);

    {
        size_t  i;

        color[empty] = 0xFFFFFFu;

        /* Assign random colors. */
        for (i = 0; i < empty; i++)
            color[i] = prng_u64() >> 40;
    }

    printf("P6\n%lu %lu 255\n", size, size);
    for (r = 0; r < rows; r++)
        for (c = 0; c < cols; c++) {
            const uint32_t  rgb = color[label[ncols*r + c]];
            fputc((rgb >> 16) & 255, stdout);
            fputc((rgb >> 8)  & 255, stdout);
            fputc( rgb        & 255, stdout);
        }
    fflush(stdout);

    fprintf(stderr, "Done.\n");

    return EXIT_SUCCESS;
}

这是p = 40%的256x256矩阵的样子:

请注意,白色背景不算作簇.

Consider a L x L size matrix M, whose entries can be either 0 or 1. Each element is 1 with probability p and 0 with probability 1 - p. I will call the elements labelled 1 as black elements and elements labelled 0 as white elements. I'm trying to write a code which:

  • Generates a random matrix with entries 0's and 1's. I need to input size of matrix L and the probability p.

  • Labels all the black elements belonging to the same cluster with the same number. (Define a cluster of black element as a maximal connected component in the graph of cells with the value of 1, where edges connect cells whose rows and columns both differ by at most 1 (so up to eight neighbours for each cell). In other words if two black elements of the matrix share an edge or a vertex consider them as belonging to the same black cluster. That is, think of the matrix as a large square and the elements as small squares in the large square.)

  • Within a loop that runs from p = 0 to p = 100 (I'm referring to probability p as a percentage): the number of black clusters of each size is written onto a file corresponding to that value of p.

For example:

Input: p = 30, L = 50

Output (which is written to a data file for each p; thus there are 101 data files created by this program, from p = 0 to p = 100):

1 100 (i.e. there are 100 black clusters of size 1)

2 20 (i.e. there are 20 black clusters of size 2)

3 15 (i.e. there are 15 black clusters of size 3)

and so on...

#include "stdafx.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <string.h>
int *labels;
int  n_labels = 0;

int uf_find(int x) {
    int y = x;
    while (labels[y] != y)
        y = labels[y];

    while (labels[x] != x) {
        int z = labels[x];
        labels[x] = y;
        x = z;
    }
    return y;
}

int uf_union(int x, int y) {
    return labels[uf_find(x)] = uf_find(y);
}

int uf_make_set(void) {
    labels[0] ++;
    assert(labels[0] < n_labels);
    labels[labels[0]] = labels[0];
    return labels[0];
}

void uf_initialize(int max_labels) {
    n_labels = max_labels;
    labels = (int*)calloc(sizeof(int), n_labels);
    labels[0] = 0;
}

void uf_done(void) {
    free(labels);
    labels = 0;
}

#define max(a,b) (a>b?a:b)
#define min(a,b) (a>b?b:a)

int hoshen_kopelman(int **matrix, int m, int n) {

    uf_initialize(m * n / 2);

    for (int i = 0; i<m; i++)
        for (int j = 0; j<n; j++)
            if (matrix[i][j]) {

                int up = (i == 0 ? 0 : matrix[i - 1][j]);
                int left = (j == 0 ? 0 : matrix[i][j - 1]);

                switch (!!up + !!left) {

                case 0:
                    matrix[i][j] = uf_make_set();
                    break;

                case 1:
                    matrix[i][j] = max(up, left);
                    break;

                case 2:
                    matrix[i][j] = uf_union(up, left);
                    break;
                }
                int north_west;
                if (i == 0 || j == 0)
                    north_west = 0;
                else
                    north_west = matrix[i - 1][j - 1];

                int north_east;
                if (i == 0 || j == (n - 1))
                    north_east = 0;
                else
                    north_east = matrix[i - 1][j + 1];

                if (!!north_west == 1)
                {
                    if (i != 0 && j != 0)
                    {
                        //if (rand() % 2 == 1)
                        //{
                            if (matrix[i][j - 1] == 0 && matrix[i - 1][j] == 0)
                            {
                                if (!!matrix[i][j] == 0)
                                    matrix[i][j] = north_west;
                                else
                                    uf_union(north_west, matrix[i][j]);
                            }
                        //}

                    }
                    else if (i == 0 || j == 0)
                    {

                    }

                }
                if (!!north_east == 1)
                {
                    if (i != 0 && j != n - 1)
                    {
                        //if (rand() % 2 == 1)
                        //{
                            if (matrix[i - 1][j] == 0 && matrix[i][j + 1] == 0)
                            {
                                if (!!matrix[i][j] == 0)
                                    matrix[i][j] = north_east;
                                else
                                    uf_union(north_east, matrix[i][j]);
                            }
                        //}
                    }
                    else if (i == 0 || j == n - 1)
                    {

                    }
                }
            }
    int *new_labels = (int*)calloc(sizeof(int), n_labels);

    for (int i = 0; i<m; i++)
        for (int j = 0; j<n; j++)
            if (matrix[i][j]) {
                int x = uf_find(matrix[i][j]);
                if (new_labels[x] == 0) {
                    new_labels[0]++;
                    new_labels[x] = new_labels[0];
                }
                matrix[i][j] = new_labels[x];
            }

    int total_clusters = new_labels[0];

    free(new_labels);
    uf_done();

    return total_clusters;
}

void check_labelling(int **matrix, int m, int n) {
    int N, S, E, W;
    for (int i = 0; i<m; i++)
        for (int j = 0; j<n; j++)
            if (matrix[i][j]) {
                N = (i == 0 ? 0 : matrix[i - 1][j]);
                S = (i == m - 1 ? 0 : matrix[i + 1][j]);
                E = (j == n - 1 ? 0 : matrix[i][j + 1]);
                W = (j == 0 ? 0 : matrix[i][j - 1]);

                assert(N == 0 || matrix[i][j] == N);
                assert(S == 0 || matrix[i][j] == S);
                assert(E == 0 || matrix[i][j] == E);
                assert(W == 0 || matrix[i][j] == W);
            }
}

int cluster_count(int **matrix, int size, int N)
{
    int i;
    int j;
    int count = 0;
    for (i = 0; i < size; i++)
    {
        for (j = 0; j < size; j++)
        {
            if (matrix[i][j] == N)
                count++;
        }
    }
    return count;
}

int main(int argc, char **argv)
{
    srand((unsigned int)time(0));
    int p = 0;
    printf("Enter number of rows/columns: ");
    int size = 0;
    scanf("%d", &size);
    printf("\n");
    FILE *fp;

    printf("Enter number of averaging iterations: ");
    int iterations = 0;

    scanf("%d", &iterations);

        for (int p = 0; p <= 100; p++)
        { 
            char str[100];
            sprintf(str, "BlackSizeDistribution%03i.txt", p);
            fp = fopen(str, "w");
            int **matrix;
            matrix = (int**)calloc(10, sizeof(int*));
            int** matrix_new = (int**)calloc(10, sizeof(int*));
            matrix_new = (int **)realloc(matrix, sizeof(int*) * size);
            matrix = matrix_new;
        for (int i = 0; i < size; i++)
        {
            matrix[i] = (int *)calloc(size, sizeof(int));
            for (int j = 0; j < size; j++)
            {
                int z = rand() % 100;
                z = z + 1;
                if (p == 0)
                    matrix[i][j] = 0;
                if (z <= p)
                    matrix[i][j] = 1;
                else
                    matrix[i][j] = 0;
            }
        }

        hoshen_kopelman(matrix, size, size);
        int highest = matrix[0][0];
        for (int i = 0; i < size; i++)
            for (int j = 0; j < size; j++)
                if (highest < matrix[i][j])
                    highest = matrix[i][j];
        int* counter = (int*)calloc(sizeof(int*), highest + 1);
        int high = 0;
        for (int k = 1; k <= highest; k++)
        {
            counter[k] = cluster_count(matrix, size, k);
            if (counter[k] > high)
                high = counter[k];
        }
        int* size_distribution = (int*)calloc(sizeof(int*), high + 1);

        for (int y = 1; y <= high; y++)
        {
           int count = 0;
           for (int z = 1; z <= highest; z++)
               if (counter[z] == y)
                   count++;
           size_distribution[y] = count;
        }

        for (int k = 1; k <= high; k++)
        {
            fprintf(fp, "\n%d\t%d", k, size_distribution[k]);
            printf("\n%d\t%d", k, size_distribution[k]);
        }
        printf("\n");
        check_labelling(matrix, size, size);
        for (int i = 0; i < size; i++)
            free(matrix[i]);
        free(matrix);
        }
}

I used the data files outputted by this program to create bar graphs for each p ranging from 0 to 100, using Gnuplot.

Some sample graphs:

This one corresponds to input p = 3 and L = 100

This one corresponds to p = 90 and L = 100


Okay, so I suppose that was enough context for the actual question I had.

The program I wrote only outputs value for one 1 iteration per value of p. Since this program is for a scientific computing purpose I need more accurate values and for that I need to output "averaged" size distribution value; over say 50 or 100 iterations. I'm not exactly sure how to do the averaging.

To be more clear this is what I want:

Say the output for three different runs of the program gives me (for say p = 3, L = 100)

Size    Iteration 1    Iteration 2    Iteration 3  Averaged Value (Over 3 iterations)

1       150            170             180         167

2       10             20              15          18

3       1              2               1           1

4       0              0               1           0  
.
.
.

I want to do something similar, except that I would need to perform the averaging over 50 or 100 iterations to get accurate values for my mathematical model. By the way, I don't really need to output the values for each iteration i.e. Iteration 1, Iteration 2, Iteration 3, ... in the data file. All I need to "print" is the first column and the last column (which has the averaged values) in the above table.

Now, I would probably need to modify the main function for that to do the averaging.

Here's the main function:

int main(int argc, char **argv)
    {
        srand((unsigned int)time(0));
        int p = 0;
        printf("Enter number of rows/columns: ");
        int size = 0;
        scanf("%d", &size);
        printf("\n");
        FILE *fp;

        printf("Enter number of averaging iterations: ");
        int iterations = 0;

        scanf("%d", &iterations);

            for (int p = 0; p <= 100; p++)
            { 
                char str[100];
                sprintf(str, "BlackSizeDistribution%03i.txt", p);
                fp = fopen(str, "w");
                int **matrix;
                matrix = (int**)calloc(10, sizeof(int*));
                int** matrix_new = (int**)calloc(10, sizeof(int*));
                matrix_new = (int **)realloc(matrix, sizeof(int*) * size);
                matrix = matrix_new;
            for (int i = 0; i < size; i++)
            {
                matrix[i] = (int *)calloc(size, sizeof(int));
                for (int j = 0; j < size; j++)
                {
                    int z = rand() % 100;
                    z = z + 1;
                    if (p == 0)
                        matrix[i][j] = 0;
                    if (z <= p)
                        matrix[i][j] = 1;
                    else
                        matrix[i][j] = 0;
                }
            }

            hoshen_kopelman(matrix, size, size);
            int highest = matrix[0][0];
            for (int i = 0; i < size; i++)
                for (int j = 0; j < size; j++)
                    if (highest < matrix[i][j])
                        highest = matrix[i][j];
            int* counter = (int*)calloc(sizeof(int*), highest + 1);
            int high = 0;
            for (int k = 1; k <= highest; k++)
            {
                counter[k] = cluster_count(matrix, size, k);
                if (counter[k] > high)
                    high = counter[k];
            }
            int* size_distribution = (int*)calloc(sizeof(int*), high + 1);

            for (int y = 1; y <= high; y++)
            {
               int count = 0;
               for (int z = 1; z <= highest; z++)
                   if (counter[z] == y)
                       count++;
               size_distribution[y] = count;
            }

            for (int k = 1; k <= high; k++)
            {
                fprintf(fp, "\n%d\t%d", k, size_distribution[k]);
                printf("\n%d\t%d", k, size_distribution[k]);
            }
            printf("\n");
            check_labelling(matrix, size, size);
            for (int i = 0; i < size; i++)
                free(matrix[i]);
            free(matrix);
            }
    }

One of the ways I thought of was declaring a 2D array of size (largest size of black cluster possible for that) x (number of averaging iterations I want), for each run of the p-loop, and at the end of the program extract the average size distribution for each p from the 2D array. The largest size of black cluster possible in a matrix of size 100 x 100 (say) is 10,000. But for say smaller values of p (say p = 1, p = 20, ...) such large size clusters are never even produced! So creating such a large 2D array in the beginning would be a terrible wastage of memory space and it would take days to execute! (Please keep in mind that I need to run this program for L = 1000 and L = 5000 and if possible even L = 10000 for my purpose)

There must be some other more efficient way to do this. But I don't know what. Any help is appreciated.

解决方案

Ah, a topic very close to my own heart! :)

Because you are collecting statistics, and the exact configuration is only interesting for the duration of collecting said statistics, you only need two L×L matrixes: one to hold the color information (one white, 0 to L×L blacks, so a type that can hold an integer between 0 and L2+1, inclusive); and the other to hold the number of elements of that type (an integer between 0 and L2, inclusive).

To hold the number of black clusters of each size for each L and p value, across many iterations, you'll need a third array of L×L+1 elements; but do note that its values can reach N×L×L, if you do N iterations using the same L and p.

The standard C pseudorandom number generator is horrible; do not use that. Use either Mersenne Twister or my favourite, Xorshift64* (or the related Xorshift1024*). While Xorshift64* is extremely fast, its distribution is sufficiently "random" for simulations such as yours.

Instead of storing just "black" or "white", store either "uninteresting", or a cluster ID; initially, all (unconnected) blacks have an unique cluster ID. This way, you can implement a disjoint set and join the connected cells to clusters very efficiently.

Here is how I would implement this, in pseudocode:

Let id[L*L]        be the color information; index is L*row+col
Let idcount[L*L+1] be the id count over one iteration
Let count[L*L+1]   be the number of clusters across N iterations

Let limit = (p / 100.0) * PRNG_MAX

Let white = L*L    (and blacks are 0 to L*L-1, inclusive)
Clear count[] to all zeros

For iter = 1 to N, inclusive:

    For row = 0 to L-1, inclusive:

        If PRNG() <= limit:
            Let id[L*row] = L*row
        Else:
            Let id[L*row] = white
        End If

        For col = 1 to L-1, inclusive:
            If PRNG() <= limit:
                If id[L*row+col-1] == white:
                    Let id[L*row+col] = L*row + col
                Else:
                    Let id[L*row+col] = id[L*row+col-1]
                End If
            Else:
                Let id[L*row+col] = white
            End If
        End For

    End For

Note that I do the horizontal join pass when generating the cell ID's, simply by reusing the same ID for consecutive black cells. At this point, id[][] is now filled with black cells of various IDs at a probability of p percent, if PRNG() returns an unsigned integer between 0 and PRNG_MAX, inclusive.

Next, you do a pass of joins. Because horizontally connected cells are already joined, you need to do vertical, and the two diagonals. Do note that you should use path flattening when joining, for efficiency.

    For row = 0 to L-2, inclusive:
        For col = 0 to L-1, inclusive:
            If (id[L*row+col] != white) And (id[L*row+L+col] != white):
                Join id[L*row+col] and id[L*row+L+col]
            End If
        End For
    End For

    For row = 0 to L-2, inclusive:
        For col = 0 to L-2, inclusive:
            If (id[L*row+col] != white) And (id[L*row+L+col+1] != white):
                Join id[L*row+col] and id[L*row+L+col+1]
            End If
        End For
    End For

    For row = 1 to L-1, inclusive:
        For col = 0 to L-2, inclusive:
            If (id[L*row+col] != white) And (id[L*row-L+col+1] != white):
                Join id[L*row+col] and id[L*row-L+col+1]
            End If
        End For
    End For

Of course, you should combine loops, to maintain best possible locality of access. (Do col == 0 separately, and row == 0 in a separate inner loop, minimizing if clauses, as those tend to be slow.) You could even make the id array (L+1)2 in size, filling the outer edge of cells with white, to simplify the above joins into a single double loop, at a cost of 4L+4 extra cells used.

At this point, you need to flatten each disjoint set. Each ID value that is not white is either L*row+col (meaning "this"), or a reference to another cell. Flattening means that for each ID, we find the final ID in the chain, and use that instead:

    For i = 0 to L*L-1, inclusive:
        If (id[i] != white):
            Let k = i
            While (id[k] != k):
                k = id[k]
            End While
            id[i] = k
        End If
    End For

Next, we need to count the number of cells that have a specific ID:

    Clear idn[]

    For i = 0 to L*L-1, inclusive:
        Increment idn[id[i]]
    End For

Because we are interested in the number of clusters of specific size over N iterations, we can simply update the count array by each cluster of specific size found in this iteration:

    For i = 1 to L*L, inclusive:
        Increment count[idn[i]]
    End For
    Let count[0] = 0

End For

At this point count[i] contains the number of black clusters of i cells found over N iterations; in other words, it is the histogram (discrete distribution) of the cluster sizes seen during the N iterations.


One implementation of the above could be as follows.

(The very first implementation had issues with the array sizes and cluster labels; this version splits that part into a separate file, and is easier to verify for correctness. Still, this is just the second version, so I do not consider it production-quality.)

First, the functions manipulating the matrix, cluster.h:

#ifndef   CLUSTER_H
#define   CLUSTER_H
#include <stdlib.h>
#include <inttypes.h>
#include <string.h>
#include <stdio.h>
#include <time.h>

/*
   This is in the public domain.
   Written by Nominal Animal <question@nominal-animal.net>
*/

typedef  uint32_t  cluster_label;
typedef  uint32_t  cluster_size;

static size_t  rows = 0;    /* Number of mutable rows */
static size_t  cols = 0;    /* Number of mutable columns */
static size_t  nrows = 0;   /* Number of accessible rows == rows + 1 */
static size_t  ncols = 0;   /* Number of accessible columns == cols + 1 */
static size_t  cells = 0;   /* Total number of cells == nrows*ncols */
static size_t  empty = 0;   /* Highest-numbered label == nrows*ncols - 1 */
static size_t  sizes = 0;   /* Number of possible cluster sizes == rows*cols + 1 */
#define  INDEX(row, col)  (ncols*(row) + (col))

cluster_label   *label  = NULL;  /* 2D contents: label[ncols*(row) + (col)] */
cluster_size    *occurs = NULL;  /* Number of occurrences of each label value */

/* Xorshift64* PRNG used for generating the matrix.
*/
static uint64_t  prng_state = 1;

static inline uint64_t  prng_u64(void)
{
    uint64_t  state = prng_state;
    state ^= state >> 12;
    state ^= state << 25;
    state ^= state >> 27;
    prng_state = state;
    return state * UINT64_C(2685821657736338717);
}

static inline uint64_t  prng_u64_less(void)
{
    uint64_t  result;
    do {
        result = prng_u64();
    } while (result == UINT64_C(18446744073709551615));
    return result;
}

static inline uint64_t  prng_seed(const uint64_t  seed)
{
    if (seed)
        prng_state = seed;
    else
        prng_state = 1;

    return prng_state;
}

static inline uint64_t  prng_randomize(void)
{
    int       discard = 1024;
    uint64_t  seed;
    FILE     *in;

    /* By default, use time. */
    prng_seed( ((uint64_t)time(NULL) * 2832631) ^
               ((uint64_t)clock() * 1120939) );

#ifdef __linux__
    /* On Linux, read from /dev/urandom. */
    in = fopen("/dev/urandom", "r");
    if (in) {
        if (fread(&seed, sizeof seed, 1, in) == 1)
            prng_seed(seed);
        fclose(in);        
    }
#endif

    /* Churn the state a bit. */
    seed = prng_state;
    while (discard-->0) {
        seed ^= seed >> 12;
        seed ^= seed << 25;
        seed ^= seed >> 27;
    }
    return prng_state = seed;
}




static inline void cluster_free(void)
{
    free(occurs); occurs = NULL;
    free(label);  label = NULL;
    rows = 0;
    cols = 0;
    nrows = 0;
    ncols = 0;
    cells = 0;
    empty = 0;
    sizes = 0;
}


static int cluster_init(const size_t want_cols, const size_t want_rows)
{
    cluster_free();

    if (want_cols < 1 || want_rows < 1)
        return -1; /* Invalid number of rows or columns */

    rows = want_rows;
    cols = want_cols;

    nrows = rows + 1;
    ncols = cols + 1;

    cells = nrows * ncols;

    if ((size_t)(cells / nrows) != ncols ||
        nrows <= rows || ncols <= cols)
        return -1; /* Size overflows */

    empty = nrows * ncols - 1;

    sizes = rows * cols + 1;

    label  = calloc(cells, sizeof label[0]);
    occurs = calloc(cells, sizeof occurs[0]);
    if (!label || !occurs) {
        cluster_free();
        return -1; /* Not enough memory. */
    }

    return 0;
}


static void join2(size_t i1, size_t i2)
{
    size_t  root1 = i1, root2 = i2, root;

    while (root1 != label[root1])
        root1 = label[root1];

    while (root2 != label[root2])
        root2 = label[root2];

    root = root1;
    if (root > root2)
        root = root2;

    while (label[i1] != root) {
        const size_t  temp = label[i1];
        label[i1] = root;
        i1 = temp;
    }

    while (label[i2] != root) {
        const size_t  temp = label[i2];
        label[i2] = root;
        i2 = temp;
    }
}

static void join3(size_t i1, size_t i2, size_t i3)
{
    size_t  root1 = i1, root2 = i2, root3 = i3, root;

    while (root1 != label[root1])
        root1 = label[root1];

    while (root2 != label[root2])
        root2 = label[root2];

    while (root3 != label[root3])
        root3 = label[root3];

    root = root1;
    if (root > root2)
        root = root2;
    if (root > root3)
        root = root3;

    while (label[i1] != root) {
        const size_t  temp = label[i1];
        label[i1] = root;
        i1 = temp;
    }

    while (label[i2] != root) {
        const size_t  temp = label[i2];
        label[i2] = root;
        i2 = temp;
    }

    while (label[i3] != root) {
        const size_t  temp = label[i3];
        label[i3] = root;
        i3 = temp;
    }
}

static void join4(size_t i1, size_t i2, size_t i3, size_t i4)
{
    size_t  root1 = i1, root2 = i2, root3 = i3, root4 = i4, root;

    while (root1 != label[root1])
        root1 = label[root1];

    while (root2 != label[root2])
        root2 = label[root2];

    while (root3 != label[root3])
        root3 = label[root3];

    while (root4 != label[root4])
        root4 = label[root4];

    root = root1;
    if (root > root2)
        root = root2;
    if (root > root3)
        root = root3;
    if (root > root4)
        root = root4;

    while (label[i1] != root) {
        const size_t  temp = label[i1];
        label[i1] = root;
        i1 = temp;
    }

    while (label[i2] != root) {
        const size_t  temp = label[i2];
        label[i2] = root;
        i2 = temp;
    }

    while (label[i3] != root) {
        const size_t  temp = label[i3];
        label[i3] = root;
        i3 = temp;
    }

    while (label[i4] != root) {
        const size_t  temp = label[i4];
        label[i4] = root;
        i4 = temp;
    }
}

static void cluster_fill(const uint64_t plimit)
{
    size_t  r, c;

    /* Fill the matrix with the specified probability.
       For efficiency, we use the same label for all
       horizontally consecutive cells.
    */
    for (r = 0; r < rows; r++) {
        const size_t  imax = ncols*r + cols;
        cluster_label last;
        size_t        i = ncols*r;

        last = (prng_u64_less() < plimit) ? i : empty;
        label[i] = last;

        while (++i < imax) {
            if (prng_u64_less() < plimit) {
                if (last == empty)
                    last = i;
            } else
                last = empty;

            label[i] = last;
        }

        label[imax] = empty;
    }

    /* Fill the extra row with empty, too. */
    {
        cluster_label       *ptr = label + ncols*rows;
        cluster_label *const end = label + ncols*nrows;
        while (ptr < end)
            *(ptr++) = empty;
    }

    /* On the very first row, we need to join non-empty cells
       vertically and diagonally down. */
    for (c = 0; c < cols; c++)
        switch ( ((label[c]         < empty) << 0) |
                 ((label[c+ncols]   < empty) << 1) |
                 ((label[c+ncols+1] < empty) << 2) ) {
            /*  <1>    *
             *   2  4  */
        case 3: /* Join down */
            join2(c, c+ncols); break;
        case 5: /* Join down right */
            join2(c, c+ncols+1); break;
        case 7: /* Join down and down right */
            join3(c, c+ncols, c+ncols+1); break;
        }

    /* On the other rows, we need to join non-empty cells
       vertically, diagonally down, and diagonally up. */
    for (r = 1; r < rows; r++) {
        const size_t  imax = ncols*r + cols;
        size_t        i;
        for (i = ncols*r; i < imax; i++)
            switch ( ((label[i]         < empty) << 0) |
                     ((label[i-ncols+1] < empty) << 1) |
                     ((label[i+ncols]   < empty) << 2) |
                     ((label[i+ncols+1] < empty) << 3) ) {
            /*      2  *
             *  <1>    *
             *   4  8  */
            case 3: /* Diagonally up */
                join2(i, i-ncols+1); break;
            case 5: /* Down */
                join2(i, i+ncols); break;
            case 7: /* Down and diagonally up */
                join3(i, i-ncols+1, i+ncols); break;
            case 9: /* Diagonally down */
                join2(i, i+ncols+1); break;
            case 11: /* Diagonally up and diagonally down */
                join3(i, i-ncols+1, i+ncols+1); break;
            case 13: /* Down and diagonally down */
                join3(i, i+ncols, i+ncols+1); break;
            case 15: /* Down, diagonally down, and diagonally up */
                join4(i, i-ncols+1, i+ncols, i+ncols+1); break;
            }
    }

    /* Flatten the labels, so that all cells belonging to the
       same cluster will have the same label. */
    {
        const size_t  imax = ncols*rows;
        size_t        i;
        for (i = 0; i < imax; i++)
            if (label[i] < empty) {
                size_t  k = i, kroot = i;

                while (kroot != label[kroot])
                    kroot = label[kroot];

                while (label[k] != kroot) {
                    const size_t  temp = label[k];
                    label[k] = kroot;
                    k = temp;
                }
            }
    }

    /* Count the number of occurrences of each label. */
    memset(occurs, 0, (empty + 1) * sizeof occurs[0]);
    {
        cluster_label *const end = label + ncols*rows;
        cluster_label       *ptr = label;

        while (ptr < end)
            ++occurs[*(ptr++)];
    }
}

#endif /* CLUSTER_H */

Note that to get full probability range, the prng_u64_less() function never returns the highest possible uint64_t. It is way overkill, precision-wise, though.

The main.c parses the input parameters, iterates, and prints the results:

#include <stdlib.h>
#include <inttypes.h>
#include <stdio.h>
#include "cluster.h"

/*
   This program is in the public domain.
   Written by Nominal Animal <question@nominal-animal.net>
*/

int main(int argc, char *argv[])
{
    uint64_t      *count = NULL;
    uint64_t       plimit;
    unsigned long  size, iterations, i;
    double         probability;
    char           dummy;

    if (argc != 4) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s SIZE ITERATIONS PROBABILITY\n", argv[0]);
        fprintf(stderr, "\n");
        fprintf(stderr, "Where  SIZE         sets the matrix size (SIZE x SIZE),\n");
        fprintf(stderr, "       ITERATIONS   is the number of iterations, and\n");
        fprintf(stderr, "       PROBABILITY  is the probability [0, 1] of a cell\n");
        fprintf(stderr, "                    being non-empty.\n");
        fprintf(stderr, "\n");
        return EXIT_FAILURE;
    }

    if (sscanf(argv[1], " %lu %c", &size, &dummy) != 1 || size < 1u) {
        fprintf(stderr, "%s: Invalid matrix size.\n", argv[1]);
        return EXIT_FAILURE;
    }

    if (sscanf(argv[2], " %lu %c", &iterations, &dummy) != 1 || iterations < 1u) {
        fprintf(stderr, "%s: Invalid number of iterations.\n", argv[2]);
        return EXIT_FAILURE;
    }

    if (sscanf(argv[3], " %lf %c", &probability, &dummy) != 1 || probability < 0.0 || probability > 1.0) {
        fprintf(stderr, "%s: Invalid probability.\n", argv[3]);
        return EXIT_FAILURE;
    }

    /* Technically, we want plimit = (uint64_t)(0.5 + 18446744073709551615.0*p),
       but doubles do not have the precision for that. */
    if (probability > 0.9999999999999999)
        plimit = UINT64_C(18446744073709551615);
    else
    if (probability <= 0)
        plimit = UINT64_C(0);
    else
        plimit = (uint64_t)(18446744073709551616.0 * probability);

    /* Try allocating memory for the matrix and the label count array. */
    if (size > 2147483646u || cluster_init(size, size)) {
        fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]);
        return EXIT_FAILURE;
    }

    /* Try allocating memory for the cluster size histogram. */
    count = calloc(sizes + 1, sizeof count[0]);
    if (!count) {
        fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]);
        return EXIT_FAILURE;
    }

    printf("# Xorshift64* seed is %" PRIu64 "\n", prng_randomize());
    printf("# Matrix size is %lu x %lu\n", size, size);
    printf("# Probability of a cell to be non-empty is %.6f%%\n",
           100.0 * ((double)plimit / 18446744073709551615.0));
    printf("# Collecting statistics over %lu iterations\n", iterations);
    printf("# Size Count CountPerIteration\n");
    fflush(stdout);

    for (i = 0u; i < iterations; i++) {
        size_t  k = empty;

        cluster_fill(plimit);

        /* Add cluster sizes to the histogram. */
        while (k-->0)
            ++count[occurs[k]];
    }

    /* Print the histogram of cluster sizes. */
    {
        size_t k = 1;

        printf("%zu %" PRIu64 " %.3f\n", k, count[k], (double)count[k] / (double)iterations);

        for (k = 2; k < sizes; k++)
            if (count[k-1] != 0 || count[k] != 0 || count[k+1] != 0)
                printf("%zu %" PRIu64 " %.3f\n", k, count[k], (double)count[k] / (double)iterations);
    }

    return EXIT_SUCCESS;
}

The program outputs the cluster size distribution (histogram) in Gnuplot-compatible format, beginning with a few comment lines starting with # to indicate the exact parameters used to compute the results.

The first column in the output is the cluster size (number of cells in a cluster), the second column is the exact count of such clusters found during the iterations, and the third column is the observed probability of occurrence (i.e., total count of occurrences divided by the number of iterations).

Here is what the cluster size distribution for L=1000, N=1000 looks like for a few different values of p: Note that both axes have logarithmic scaling, so it is a log-log plot. It was generated by running the above program a few times, once for each unique p, and saving the output to a file (name containing the value of p), then printing them into a single plot in Gnuplot.

For verification of the clustering functions, I wrote a test program that generates one matrix, assigns random colors to each cluster, and outputs it as an PPM image (that you can manipulate with NetPBM tools, or basically any image manipulation program); ppm.c:

#include <stdlib.h>
#include <inttypes.h>
#include <stdio.h>
#include "cluster.h"

/*
   This program is in the public domain.
   Written by Nominal Animal <question@nominal-animal.net>
*/

int main(int argc, char *argv[])
{
    uint64_t       plimit;
    uint32_t      *color;
    unsigned long  size, r, c;
    double         probability;
    char           dummy;

    if (argc != 3) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s SIZE PROBABILITY > matrix.ppm\n", argv[0]);
        fprintf(stderr, "\n");
        fprintf(stderr, "Where  SIZE         sets the matrix size (SIZE x SIZE),\n");
        fprintf(stderr, "       PROBABILITY  is the probability [0, 1] of a cell\n");
        fprintf(stderr, "                    being non-empty.\n");
        fprintf(stderr, "\n");
        return EXIT_FAILURE;
    }

    if (sscanf(argv[1], " %lu %c", &size, &dummy) != 1 || size < 1u) {
        fprintf(stderr, "%s: Invalid matrix size.\n", argv[1]);
        return EXIT_FAILURE;
    }

    if (sscanf(argv[2], " %lf %c", &probability, &dummy) != 1 || probability < 0.0 || probability > 1.0) {
        fprintf(stderr, "%s: Invalid probability.\n", argv[2]);
        return EXIT_FAILURE;
    }

    /* Technically, we want plimit = (uint64_t)(0.5 + 18446744073709551615.0*p),
       but doubles do not have the precision for that. */
    if (probability > 0.9999999999999999)
        plimit = UINT64_C(18446744073709551615);
    else
    if (probability <= 0)
        plimit = UINT64_C(0);
    else
        plimit = (uint64_t)(18446744073709551616.0 * probability);

    /* Try allocating memory for the matrix and the label count array. */
    if (size > 2147483646u || cluster_init(size, size)) {
        fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]);
        return EXIT_FAILURE;
    }

    /* Try allocating memory for the color lookup array. */
    color = calloc(empty + 1, sizeof color[0]);
    if (!color) {
        fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]);
        return EXIT_FAILURE;
    }

    fprintf(stderr, "Using Xorshift64* seed %" PRIu64 "\n", prng_randomize());
    fflush(stderr);

    /* Construct the matrix. */
    cluster_fill(plimit);

    {
        size_t  i;

        color[empty] = 0xFFFFFFu;

        /* Assign random colors. */
        for (i = 0; i < empty; i++)
            color[i] = prng_u64() >> 40;
    }

    printf("P6\n%lu %lu 255\n", size, size);
    for (r = 0; r < rows; r++)
        for (c = 0; c < cols; c++) {
            const uint32_t  rgb = color[label[ncols*r + c]];
            fputc((rgb >> 16) & 255, stdout);
            fputc((rgb >> 8)  & 255, stdout);
            fputc( rgb        & 255, stdout);
        }
    fflush(stdout);

    fprintf(stderr, "Done.\n");

    return EXIT_SUCCESS;
}

Here is what a 256x256 matrix at p=40% looks like:

Note that the white background is not counted as a cluster.

这篇关于在无需声明巨大的2D数组的情况下找到平均尺寸分​​布的有效方法是什么?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆