检测地理群集 [英] Detecting geographic clusters

查看:143
本文介绍了检测地理群集的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我有一个包含经度,纬度的R data.frame,它横跨整个美国地图。当X个条目都在一个小的地理区域内,例如几度经度&几度纬度,我希望能够检测到这一点,然后让我的程序返回地理边界框的坐标。是否有Python或R CRAN包已经这样做?如果没有,我该如何确定这些信息? 我能够将Joran的答案和Dan H的评论结合起来。这是一个输出示例:



python代码为R:map()和rect()发出函数。这个美国示例地图是用以下方式创建的:

  map('state',plot = TRUE,fill = FALSE,col = palette( ))

然后您可以在R GUI解释器中相应地使用rect() (见下面)。

pre $ 从集合中导入数学
import defaultdict

to_rad = math .pi / 180.0#将lat或lng转换为弧度
fname =site.tsv#文件格式:LAT\tLONG
threshhold_dist = 50#根据需要调整
threshhold_locations = 15#群集中需要的最少位置数

def dist(lat1,lng1,lat2,lng2):
全局to_rad
earth_radius_km = 6371

dLat =(lat2-lat1)* to_rad
dLon =(lng2-lng1)* to_rad
lat1_rad = lat1 * to_rad
lat2_rad = lat2 * to_rad

a = math。 sin(dLat / 2)* math.sin(dLat / 2)+ math.sin(dLon / 2)* math.sin(dLon / 2)* math.cos(lat1_rad)* math.cos(lat2_rad)
c = 2 * m ath.atan2(math.sqrt(a),math.sqrt(1-a));
dist = earth_radius_km * c
return dist

def bounding_box(src,neighbors):
neighbors.append(src)
#nw = NorthWest se =东南
nw_lat = -360
nw_lng = 360
se_lat = 360
se_lng = -360

(y,x)邻居:
if y> nw_lat:nw_lat = y
如果x> se_lng:se_lng = x

if y< se_lat:se_lat = y
如果x < nw_lng:nw_lng = x

#添加一些填充物
pad = 0.5
nw_lat + =填充物
nw_lng - =填充物
se_lat - =填充物
se_lng + = pad

#适用于r的map()函数
return(se_lat,nw_lat,nw_lng,se_lng)
$ b $ def sitesDist(site1,site2 ):
#只是一个助手在
下面的短列表理解返回dist(site1 [0],site1 [1],site2 [0],site2 [1])$ ​​b
$ b def load_site_data():
全局fname
sites = defaultdict(元组)
$ b $ data = open(fname,encoding =latin-1)
data.readline ()#skip header
for line in data:
line = line [: - 1]
slots = line.split(\ t)
lat = float( slot [0])
lng = float(slots [1])$ ​​b $ b lat_rad = lat * math.pi / 180.0
lng_rad = lng * math.pi / 180.0
sites [ (lat,lng)] =(lat,lng)#(lat_rad,lng_rad)
返回地点

def main():
sites_dict = {}
sites = load_site_data()
用于站点中的站点:$ b​​ $ b#为每个站点将其放入字典中,其值为邻居数组
sites_dict [site] = [x for site in site如果x!= site和sitesDist(site,x)< threshhold_dist]

结果= {}
用于网站站点:$ b​​ $ bj = len(sites_dict [网站])
如果j> = threshhold_locations:
bbox结果中的coord = bounding_box(site,sites_dict [site])
results [coord] = coord


yx =ylim = c(%s,%s ),xlim = c(%s,%s)%(results [bbox])#(se_lat,nw_lat,nw_lng,se_lng)
print('map(county,plot = T,fill = T ,col = palette(),%s)'%yx)
rect ='rect(%s,%s,%s,%s,col = c(red))'%(results [bbox [b]] [2],结果[bbox] [0],结果[bbox] [3],结果[bbox] [2])
print(rect)
print()

main()

以下是一个TSV文件示例(site.tsv)

  LAT LONG 
36.3312 -94.1334
36.6828 -121.791
37.2307 -121.96
37.3857 - 122.026
37.3857 -122.026
37.3857 -122.026
37.3895 -97.644
37.3992 -122.139
37.3992 -122.139
37.402 -122 .078
37.402 -122.078
37.402 -122.078
37.402 -122.078
37.402 -122.078
37.48 -122.144
37.48 -122.144
37.55 126.967

使用我的数据集,我的python脚本的输出显示在美国地图上。

  rect(-74.989,39.7667,-73.0419,41.5209,col = c(red ))
rect(-123.005,36.8144,-121.392,38.3672,col = c(green))
rect(-78.2422,38.2474,-76.3,39.9282,col = c(blue ))






添加于2013-05-01 for Yacob






这两行为您提供了全面的目标......

  map(county,plot = T)
rect(-122.644,36.7307,-121.46,37.98,col = c(red))

如果您想缩小地图的一部分,可以使用 ylim xlim

  map(county ,plot = T,ylim = c(36.7307,37.98),xlim = c(-122.644,-121.46))
#或者用于更多着色,但选择其中一个或另一个map(country)命令
map(county,plot = T,fill = T,col = palette(),ylim = c(36.7307,37.98),xlim = c(-122.644,-121.46))
rect( - 122.644,36.7307,-121.46,37.98,col = c(red))

您将要使用'世界'地图...

  map(world,plot = T)

我已经使用了下面发布的python代码很久了,所以我将尽我所能来帮助你。

  threshhold_dist是边界框的大小,即:地理区域
theshhold_location是在
边界框中所需的经纬度/点数,以便它被视为一个群集。

这是一个完整的例子。 TSV文件位于pastebin.com上。我还包含一个由R生成的图像,其中包含所有rect()命令的输出。

 #pyclusters.py 
#May-02-2013
#-John Taylor

#latlng.tsv位于http://pastebin.com/cyvEdx3V
#使用 RAW Paste Data来保存制表符

从集合中导入数学
import defaultdict

#另请参见:http://www.geomidpoint.com/example .html
#参见:http://www.movable-type.co.uk/scripts/latlong.html

to_rad = math.pi / 180.0#convert lat or lng to弧度
fname =latlng.tsv#文件格式:LAT\tLONG
threshhold_dist = 20#根据您的需要调整
threshhold_locations = 20#群集中最少需要的位置数量$ b (经纬度):
x = math.cos(lat) * math.sin(lng)
z = math.sin(lat)
return(x,y,z)

def cart2corrd(x,y ,z):
lon = math.atan2(y,x)
hyp = math.sqrt(x * x + y * y)
lat = math.atan2(z,hyp)
return(lat,lng)

def dist(lat1,lng1,lat2,lng2):
全局to_rad,earth_radius_km

dLat =(lat2 -lat1)* to_rad
dLon =(lng2-lng1)* to_rad
lat1_rad = lat1 * to_rad
lat2_rad = lat2 * to_rad

a = math.sin(dLat / 2)* math.sin(dLat / 2)+ math.sin(dLon / 2)* math.sin(dLon / 2)* math.cos(lat1_rad)* math.cos(lat2_rad)
c = 2 * math.atan2(math.sqrt(a),math.sqrt(1-a));
dist = earth_radius_km * c
return dist

def bounding_box(src,neighbors):
neighbors.append(src)
#nw = NorthWest se =东南
nw_lat = -360
nw_lng = 360
se_lat = 360
se_lng = -360

(y,x)邻居:
if y> nw_lat:nw_lat = y
如果x> se_lng:se_lng = x

if y< se_lat:se_lat = y
如果x < nw_lng:nw_lng = x

#添加一些填充物
pad = 0.5
nw_lat + =填充物
nw_lng - =填充物
se_lat - =填充物
se_lng + = pad

#print(answer:)
#print(nw lat,lng:%s%s%(nw_lat,nw_lng))$ b $适用于r的map()函数
return(se_lat,nw_lat,nw_lng,%s,%s,%s)%b
$ se_lng)

def sitesDist(site1,site2):
#只是一个帮助短缺的列表comprehensioin低于
return dist(site1 [0],site1 [1],site2 [ 0),site2 [1])$ ​​b
$ b def load_site_data():
全局fname $ b $ sites = defaultdict(元组)
$ b data = open(fname ,编码=latin-1)
data.readline()#跳过标题
用于数据中的行:
line =行[: - 1]
slots =行。 split(\ t)
lat = float(slots [0])
lng = float(slots [1])$ ​​b $ b lat_rad = lat * math.pi / 180.0
lng_rad = lng *数学.pi / 180.0
sites [(lat,lng)] =(lat,lng)#(lat_rad,lng_rad)
返回地点

def main():
color_list =(red,blue,green,yellow,orange,brown,pink,purple)
color_idx = 0
sites_dict = { }
sites = load_site_data()
为站点中的站点:$ b​​ $ b#为每个站点放在一个词典中,它的值是一个邻居数组
sites_dict [site] = [ x for site in if if!= site and sitesDist(site,x)< threshhold_dist]
$ b print()
print('map(state,plot = T)')#或使用:县而不是州
print( )


结果= {}
在站点中的站点:$ b​​ $ bj = len(sites_dict [site])
如果j> = threshhold_locations:
$ co $ = b






$ ,%s),xlim = c(%s,%s)%(results [bbox])#(se_lat,nw_lat,nw_lng,se_lng)

#important!
#如果你想为每个群集分配一个映射,取消注释这一行
#print('map(county,plot = T,fill = T,col = palette(),%s)' %yx)
len(color_list)== color_idx:
color_idx = 0
rect ='rect(%s,%s,%s,%s,col = c(%yx)结果[bbox] [2],结果[bbox] [0],结果[bbox] [3],结果[bbox] [1],颜色列表[color_idx])
color_idx + = 1
print(rect)
print()


main()


I have a R data.frame containing longitude, latitude which spans over the entire USA map. When X number of entries are all within a small geographic region of say a few degrees longitude & a few degrees latitude, I want to be able to detect this and then have my program then return the coordinates for the geographic bounding box. Is there a Python or R CRAN package that already does this? If not, how would I go about ascertaining this information?

解决方案

I was able to combine Joran's answer along with Dan H's comment. This is an example ouput:

The python code emits functions for R: map() and rect(). This USA example map was created with:

map('state', plot = TRUE, fill = FALSE, col = palette())

and then you can apply the rect()'s accordingly from with in the R GUI interpreter (see below).

import math
from collections import defaultdict

to_rad = math.pi / 180.0   # convert lat or lng to radians
fname = "site.tsv"        # file format: LAT\tLONG
threshhold_dist=50         # adjust to your needs
threshhold_locations=15    # minimum # of locations needed in a cluster

def dist(lat1,lng1,lat2,lng2):
    global to_rad
    earth_radius_km = 6371

    dLat = (lat2-lat1) * to_rad
    dLon = (lng2-lng1) * to_rad
    lat1_rad = lat1 * to_rad
    lat2_rad = lat2 * to_rad

    a = math.sin(dLat/2) * math.sin(dLat/2) + math.sin(dLon/2) * math.sin(dLon/2) * math.cos(lat1_rad) * math.cos(lat2_rad)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a)); 
    dist = earth_radius_km * c
    return dist

def bounding_box(src, neighbors):
    neighbors.append(src)
    # nw = NorthWest se=SouthEast
    nw_lat = -360
    nw_lng = 360
    se_lat = 360
    se_lng = -360

    for (y,x) in neighbors:
        if y > nw_lat: nw_lat = y
        if x > se_lng: se_lng = x

        if y < se_lat: se_lat = y
        if x < nw_lng: nw_lng = x

    # add some padding
    pad = 0.5
    nw_lat += pad
    nw_lng -= pad
    se_lat -= pad
    se_lng += pad

    # sutiable for r's map() function
    return (se_lat,nw_lat,nw_lng,se_lng)

def sitesDist(site1,site2): 
    #just a helper to shorted list comprehension below 
    return dist(site1[0],site1[1], site2[0], site2[1])

def load_site_data():
    global fname
    sites = defaultdict(tuple)

    data = open(fname,encoding="latin-1")
    data.readline() # skip header
    for line in data:
        line = line[:-1]
        slots = line.split("\t")
        lat = float(slots[0])
        lng = float(slots[1])
        lat_rad = lat * math.pi / 180.0
        lng_rad = lng * math.pi / 180.0
        sites[(lat,lng)] = (lat,lng) #(lat_rad,lng_rad)
    return sites

def main():
    sites_dict = {}
    sites = load_site_data()
    for site in sites: 
        #for each site put it in a dictionary with its value being an array of neighbors 
        sites_dict[site] = [x for x in sites if x != site and sitesDist(site,x) < threshhold_dist] 

    results = {}
    for site in sites: 
        j = len(sites_dict[site])
        if j >= threshhold_locations:
            coord = bounding_box( site, sites_dict[site] )
            results[coord] = coord

    for bbox in results:
        yx="ylim=c(%s,%s), xlim=c(%s,%s)" % (results[bbox]) #(se_lat,nw_lat,nw_lng,se_lng)
        print('map("county", plot=T, fill=T, col=palette(), %s)' % yx)
        rect='rect(%s,%s, %s,%s, col=c("red"))' % (results[bbox][2], results[bbox][0], results[bbox][3], results[bbox][2])
        print(rect)
        print("")

main()

Here is an example TSV file (site.tsv)

LAT     LONG
36.3312 -94.1334
36.6828 -121.791
37.2307 -121.96
37.3857 -122.026
37.3857 -122.026
37.3857 -122.026
37.3895 -97.644
37.3992 -122.139
37.3992 -122.139
37.402  -122.078
37.402  -122.078
37.402  -122.078
37.402  -122.078
37.402  -122.078
37.48   -122.144
37.48   -122.144
37.55   126.967

With my data set, the output of my python script, shown on the USA map. I changed the colors for clarity.

rect(-74.989,39.7667, -73.0419,41.5209, col=c("red"))
rect(-123.005,36.8144, -121.392,38.3672, col=c("green"))
rect(-78.2422,38.2474, -76.3,39.9282, col=c("blue"))


Addition on 2013-05-01 for Yacob


These 2 lines give you the over all goal...

map("county", plot=T )
rect(-122.644,36.7307, -121.46,37.98, col=c("red"))

If you want to narrow in on a portion of a map, you can use ylim and xlim

map("county", plot=T, ylim=c(36.7307,37.98), xlim=c(-122.644,-121.46))
# or for more coloring, but choose one or the other map("country") commands
map("county", plot=T, fill=T, col=palette(), ylim=c(36.7307,37.98), xlim=c(-122.644,-121.46))
rect(-122.644,36.7307, -121.46,37.98, col=c("red"))

You will want to use the 'world' map...

map("world", plot=T )

It has been a long time since I have used this python code I have posted below so I will try my best to help you.

threshhold_dist is the size of the bounding box, ie: the geographical area
theshhold_location is the number of lat/lng points needed with in
    the bounding box in order for it to be considered a cluster.

Here is a complete example. The TSV file is located on pastebin.com. I have also included an image generated from R that contains the output of all of the rect() commands.

# pyclusters.py
# May-02-2013
# -John Taylor

# latlng.tsv is located at http://pastebin.com/cyvEdx3V
# use the "RAW Paste Data" to preserve the tab characters

import math
from collections import defaultdict

# See also: http://www.geomidpoint.com/example.html
# See also: http://www.movable-type.co.uk/scripts/latlong.html

to_rad = math.pi / 180.0  # convert lat or lng to radians
fname = "latlng.tsv"      # file format: LAT\tLONG
threshhold_dist=20        # adjust to your needs
threshhold_locations=20   # minimum # of locations needed in a cluster
earth_radius_km = 6371

def coord2cart(lat,lng):
    x = math.cos(lat) * math.cos(lng)
    y = math.cos(lat) * math.sin(lng)
    z = math.sin(lat)
    return (x,y,z)

def cart2corrd(x,y,z):
    lon = math.atan2(y,x)
    hyp = math.sqrt(x*x + y*y)
    lat = math.atan2(z,hyp)
    return(lat,lng)

def dist(lat1,lng1,lat2,lng2):
    global to_rad, earth_radius_km

    dLat = (lat2-lat1) * to_rad
    dLon = (lng2-lng1) * to_rad
    lat1_rad = lat1 * to_rad
    lat2_rad = lat2 * to_rad

    a = math.sin(dLat/2) * math.sin(dLat/2) + math.sin(dLon/2) * math.sin(dLon/2) * math.cos(lat1_rad) * math.cos(lat2_rad)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a)); 
    dist = earth_radius_km * c
    return dist

def bounding_box(src, neighbors):
    neighbors.append(src)
    # nw = NorthWest se=SouthEast
    nw_lat = -360
    nw_lng = 360
    se_lat = 360
    se_lng = -360

    for (y,x) in neighbors:
        if y > nw_lat: nw_lat = y
        if x > se_lng: se_lng = x

        if y < se_lat: se_lat = y
        if x < nw_lng: nw_lng = x

    # add some padding
    pad = 0.5
    nw_lat += pad
    nw_lng -= pad
    se_lat -= pad
    se_lng += pad

    #print("answer:")
    #print("nw lat,lng : %s %s" % (nw_lat,nw_lng))
    #print("se lat,lng : %s %s" % (se_lat,se_lng))

    # sutiable for r's map() function
    return (se_lat,nw_lat,nw_lng,se_lng)

def sitesDist(site1,site2): 
    # just a helper to shorted list comprehensioin below 
    return dist(site1[0],site1[1], site2[0], site2[1])

def load_site_data():
    global fname
    sites = defaultdict(tuple)

    data = open(fname,encoding="latin-1")
    data.readline() # skip header
    for line in data:
        line = line[:-1]
        slots = line.split("\t")
        lat = float(slots[0])
        lng = float(slots[1])
        lat_rad = lat * math.pi / 180.0
        lng_rad = lng * math.pi / 180.0
        sites[(lat,lng)] = (lat,lng) #(lat_rad,lng_rad)
    return sites

def main():
    color_list = ( "red", "blue", "green", "yellow", "orange", "brown", "pink", "purple" )
    color_idx = 0
    sites_dict = {}
    sites = load_site_data()
    for site in sites: 
        #for each site put it in a dictionarry with its value being an array of neighbors 
        sites_dict[site] = [x for x in sites if x != site and sitesDist(site,x) < threshhold_dist] 

    print("")
    print('map("state", plot=T)') # or use: county instead of state
    print("")


    results = {}
    for site in sites: 
        j = len(sites_dict[site])
        if j >= threshhold_locations:
            coord = bounding_box( site, sites_dict[site] )
            results[coord] = coord

    for bbox in results:
        yx="ylim=c(%s,%s), xlim=c(%s,%s)" % (results[bbox]) #(se_lat,nw_lat,nw_lng,se_lng)

        # important!
        # if you want an individual map for each cluster, uncomment this line
        #print('map("county", plot=T, fill=T, col=palette(), %s)' % yx)
        if len(color_list) == color_idx:
            color_idx = 0
        rect='rect(%s,%s, %s,%s, col=c("%s"))' % (results[bbox][2], results[bbox][0], results[bbox][3], results[bbox][1], color_list[color_idx])
        color_idx += 1
        print(rect)
    print("")


main()

这篇关于检测地理群集的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆