在Haskell中运行并行URL下载 [英] Running parallel URL downloads in Haskell

查看:76
本文介绍了在Haskell中运行并行URL下载的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

下面是Haskell代码(HTTP)下载给定目录中缺少的文件:

 模块Main其中

import Control.Monad(filterM
,liftM

import Data.Maybe(fromJust)
import Network.HTTP(RequestMethod(GET)
,rspBody
,simpleHTTP

import Network.HTTP.Base(Request(..))
import Network.URI(parseURI)
import System.Directory (doesFileExist)
import System.Environment(getArgs)
import System.IO(hClose
,hPutStr
,hPutStrLn
,IOMode(WriteMode)
, openFile
,stderr

import Text.Printf(printf)

indices :: [String]
indices =
map format1 [ 0..9] ++ map format2 [0..14] ++ [40001-41284:: String]
其中
format1 index =
printf%d-%d((index * 1000 + 1):: Int)
(((index + 1)* 1000):: Int)
format2 index =
printf%d-%d((10000 + 2 * index * 1000 + 1):: Int)
((10000 +(2 * index + 2)* 1000 ):: Int)
$ b $ main main :: IO()
main = do
[dir]< - getArgs
updateDownloads dir

updateDownloads :: FilePath - > IO()
updateDownloads path = do
let
fileNames = map(\index - >
(index,path ++/ tv_and_movie_freqlist++ index ++ .html))索引
缺少< -
filterM(\(_,fileName) - > liftM不是$ doesFileExist fileName)fileNames
mapM_(\(index,fileName) - > do
let
url =
http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/++
index
request =
请求
{rqURI = fromJust $ parseURI url
,rqMethod = GET
,rqHeaders = []
,rqBody =
}
hPutStrLn stderr $Downloading++显示url
resp< - simpleHTTP请求

的情况resp = $ Left→ - > hPutStrLn stderr $连接到+ + show url
正确的响应 - > do
let
html = rspBody响应
文件< - openFile fileName WriteMo de
hPutStr file html
hClose file
return())missing

我想要并行运行下载。我知道 par ,但我不确定它是否可以在 IO monad中使用,如果是这样, ?



更新:这里是我的代码,使用 Control.Concurrent.Async 重新实现, mapConcurrently

 模块Main其中

import Control.Concurrent.Async(mapConcurrently)
import Control.Monad(filterM
,liftM

import Data.Maybe(fromJust)
import Network.HTTP( RequestMethod(GET)
,rspBody
,simpleHTTP

import Network.HTTP.Base(Request(..))
import Network.URI(parseURI)
import System.Directory(doesFileExist)
import System.Environment(getArgs)
import System.IO(hClose
,hPutStr
,hPutStrLn
,IOMode( WriteMode)
,openFile
,stderr

import Text.Printf(printf)

indices :: [String]
indices =
map format1 [0..9] ++ map format2 [0..14] ++ [40001-41284:: String]
其中
format1 index =
printf%d-%d((index * 1000 + 1):: Int)
(((index + 1)* 1000):: Int)
format2 index =
printf "%d-%d" ((10000 + 2 * index * 1000 + 1) :: Int)
((10000+(2 * index + 2)* 1000):: Int)
$ b $ main :: IO()
main = do
[dir]< - getArgs
updateDownloads dir

updateDownloads :: FilePath - > IO()
updateDownloads path = do
let
fileNames = map(\index - >
(index,path ++/ tv_and_movie_freqlist++ index ++ .html))索引
缺少< -
filterM(\(_,fileName) - > liftM不是$ doesFileExist fileName)fileNames
页面< -
mapConcurrently (\(index,fileName) - > getUrl index fileName)missing
mapM_(\(fileName,html) - > do
handle< - openFile fileName WriteMode
hPutStr handle html
hClose句柄)pages
where
getUrl :: String - > FilePath - > IO(FilePath,String)
getUrl index fileName = do
let
url =
http://zh.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006 /++
索引
请求=
请求
{rqURI = fromJust $ parseURI url
,rqMethod = GET
,rqHeaders = []
,rqBody =
}
resp< - simpleHTTP请求
case $ res
Left _ - >
hPutStrLn stderr $连接到++显示网址
返回(,)
正确响应 - >
return(fileName,rspBody response)


解决方案

查看 mapConcurrently 来自Simon Marlow的异步库。

它将 IO 动作并行且异步地映射到 Traversable 容器并等待所有操作。



示例:

  { - #LANGUAGE PackageImports# - } 

导入System.Environment(getArgs)

导入asyncControl.Concurrent.Async(mapConcurrently)

导入HTTPNetwork.HTTP
导入HTTPNetwork.Stream(结果)
导入HTTPNetwork.HTTP.Base(响应(..))
import System.IO
importurlNetwork.URL(encString)

import Control.Monad


getURL :: String - > IO(String,Result(Response String))
getURL url = do
res< - (simpleHTTP。getRequest)url
return(url,res)

main = do
args< - getArgs
case $ args
[] - > putStrLn用法:程序url1 url2 ... urlN
args - > do
results< - mapConcurrently getURL args
forM_ results $ \(url,res) - >做
案例res
Left connError - > putStrLn $ url ++;++显示connError
正确的回应 - >做
putStrLn $ url ++; OK
let content = rspBody response

- 从url创建名字
fname = encString True(`notElem` :/)url ++.html
writeFile fname content


Below is Haskell code that (HTTP) downloads files that are missing from the given directory:

module Main where

import Control.Monad ( filterM
                     , liftM
                     )
import Data.Maybe ( fromJust )
import Network.HTTP ( RequestMethod(GET)
                    , rspBody
                    , simpleHTTP
                    )
import Network.HTTP.Base ( Request(..) )
import Network.URI ( parseURI )
import System.Directory ( doesFileExist )
import System.Environment ( getArgs )
import System.IO ( hClose
                 , hPutStr
                 , hPutStrLn
                 , IOMode(WriteMode)
                 , openFile
                 , stderr
                 )
import Text.Printf ( printf )

-- | Page-range labels used in the frequency-list file names:
-- ten 1000-word ranges ("1-1000" .. "9001-10000"), fifteen
-- 2000-word ranges ("10001-12000" .. "38001-40000"), and the
-- final partial range "40001-41284".
indices :: [String]
indices = lowRanges ++ midRanges ++ ["40001-41284"]
  where
    -- 1000-word ranges: k covers words k*1000+1 .. (k+1)*1000.
    lowRanges =
      [ rangeLabel (k * 1000 + 1) ((k + 1) * 1000) | k <- [0 .. 9] ]
    -- 2000-word ranges starting after word 10000.
    midRanges =
      [ rangeLabel (10000 + 2 * k * 1000 + 1) (10000 + (2 * k + 2) * 1000)
      | k <- [0 .. 14]
      ]
    rangeLabel :: Int -> Int -> String
    rangeLabel lo hi = printf "%d-%d" lo hi

-- | Entry point: expects exactly one command-line argument, the
-- directory to download missing files into.  Previously any other
-- argument count crashed with an opaque pattern-match failure; now a
-- usage message is printed to stderr instead.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [dir] -> updateDownloads dir
    _     -> hPutStrLn stderr "usage: program DIRECTORY"

-- | Download each index page that is not already present in @path@.
--
-- For every entry in 'indices' the target file is
-- @path/tv_and_movie_freqlistINDEX.html@; files that already exist are
-- skipped.  Each missing page is fetched sequentially over HTTP;
-- connection failures are reported on stderr and skipped.
updateDownloads :: FilePath -> IO ()
updateDownloads path = do
  let fileNames =
        [ (index, path ++ "/tv_and_movie_freqlist" ++ index ++ ".html")
        | index <- indices
        ]
  missing <- filterM (fmap not . doesFileExist . snd) fileNames
  mapM_ download missing
  where
    -- Fetch one page and write it out, logging progress to stderr.
    download (index, fileName) = do
      let url =
            "http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/" ++
            index
          -- fromJust is safe here: the URL is statically well-formed.
          request =
            Request
              { rqURI = fromJust $ parseURI url
              , rqMethod = GET
              , rqHeaders = []
              , rqBody = ""
              }
      hPutStrLn stderr $ "Downloading " ++ show url
      resp <- simpleHTTP request
      case resp of
        Left _ -> hPutStrLn stderr $ "Error connecting to " ++ show url
        -- writeFile opens, writes and closes the handle itself, so the
        -- handle cannot leak if an exception occurs mid-write (the old
        -- openFile/hPutStr/hClose sequence leaked on exception).
        Right response -> writeFile fileName (rspBody response)

I would like to run the downloads in parallel. I know about par, but am not sure if it can be used in the IO monad, and if so, how?

UPDATE: Here is my code reimplemented using Control.Concurrent.Async and mapConcurrently:

module Main where

import Control.Concurrent.Async ( mapConcurrently )
import Control.Monad ( filterM
                     , liftM
                     )
import Data.Maybe ( fromJust )
import Network.HTTP ( RequestMethod(GET)
                    , rspBody
                    , simpleHTTP
                    )
import Network.HTTP.Base ( Request(..) )
import Network.URI ( parseURI )
import System.Directory ( doesFileExist )
import System.Environment ( getArgs )
import System.IO ( hClose
                 , hPutStr
                 , hPutStrLn
                 , IOMode(WriteMode)
                 , openFile
                 , stderr
                 )
import Text.Printf ( printf )

-- | Page-range labels used in the frequency-list file names: ten
-- 1000-word ranges ("1-1000" .. "9001-10000"), fifteen 2000-word
-- ranges ("10001-12000" .. "38001-40000"), and one final partial range.
indices :: [String]
indices =
  map format1 [0..9] ++ map format2 [0..14] ++ ["40001-41284" :: String]
  where
    -- 1000-word ranges: index k covers words k*1000+1 .. (k+1)*1000.
    format1 index =
      printf "%d-%d" ((index * 1000 + 1) :: Int)
                     (((index + 1) * 1000) :: Int)
    -- 2000-word ranges starting after word 10000.
    format2 index =
      printf "%d-%d" ((10000 + 2 * index * 1000 + 1) :: Int)
                     ((10000 + (2 * index + 2) * 1000) :: Int)

-- | Entry point: expects exactly one command-line argument, the
-- directory to download missing files into.  Previously any other
-- argument count crashed with an opaque pattern-match failure; now a
-- usage message is printed to stderr instead.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [dir] -> updateDownloads dir
    _     -> hPutStrLn stderr "usage: program DIRECTORY"

-- | Concurrently download every index page missing from @path@.
--
-- Pages are fetched in parallel with 'mapConcurrently'; download
-- failures are logged to stderr and skipped.  The target file for
-- INDEX is @path/tv_and_movie_freqlistINDEX.html@.
updateDownloads :: FilePath -> IO ()
updateDownloads path = do
  let fileNames =
        [ (index, path ++ "/tv_and_movie_freqlist" ++ index ++ ".html")
        | index <- indices
        ]
  missing <- filterM (fmap not . doesFileExist . snd) fileNames
  pages <- mapConcurrently (uncurry getUrl) missing
  -- BUG FIX: a failed download yields an empty file name, which the
  -- old code passed straight to openFile "" (throwing an IOException).
  -- Skip failed downloads here; writeFile also closes its handle even
  -- if the write raises, unlike the old openFile/hPutStr/hClose.
  mapM_ (uncurry writeFile) (filter (not . null . fst) pages)
  where
    -- Fetch one page; on connection failure log to stderr and return
    -- an empty pair that is filtered out by the caller above.
    getUrl :: String -> FilePath -> IO (FilePath, String)
    getUrl index fileName = do
      let url =
            "http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/" ++
            index
          -- fromJust is safe here: the URL is statically well-formed.
          request =
            Request
              { rqURI = fromJust $ parseURI url
              , rqMethod = GET
              , rqHeaders = []
              , rqBody = ""
              }
      resp <- simpleHTTP request
      case resp of
        Left _ -> do
          hPutStrLn stderr $ "Error connecting to " ++ show url
          return ("", "")
        Right response ->
          return (fileName, rspBody response)

解决方案

Have a look at mapConcurrently from Simon Marlow's "async" library.

It concurrently maps an IO action over the elements of a Traversable container and waits for all of the actions to finish.

Example:

{-# LANGUAGE PackageImports #-}

import System.Environment (getArgs)

import "async" Control.Concurrent.Async (mapConcurrently)

import "HTTP" Network.HTTP
import "HTTP" Network.Stream (Result)
import "HTTP" Network.HTTP.Base (Response(..))
import System.IO
import "url" Network.URL (encString)

import Control.Monad


-- | Fetch @url@ with a plain GET request and pair the outcome with the
-- url itself, so concurrent callers can tell the responses apart.
getURL :: String -> IO (String, Result (Response String))
getURL url = do
        result <- simpleHTTP (getRequest url)
        return (url, result)

-- | Download every URL given on the command line concurrently and save
-- each successful response body to a file named after its URL; failures
-- are reported with their connection error.
main = do
     urls <- getArgs
     if null urls
       then putStrLn "usage: program url1 url2 ... urlN"
       else do
            results <- mapConcurrently getURL urls
            forM_ results $ \(url, res) ->
                case res of
                     Left connError -> putStrLn $ url ++ "; " ++ show connError
                     Right response -> do
                          putStrLn $ url ++ "; OK"
                          -- make a safe file name from the url
                          let fname = encString True (`notElem` ":/") url ++ ".html"
                          writeFile fname (rspBody response)

这篇关于在Haskell中运行并行URL下载的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆