在Haskell中运行并行URL下载 [英] Running parallel URL downloads in Haskell
问题描述
模块Main其中
import Control.Monad(filterM
,liftM
)
import Data.Maybe(fromJust)
import Network.HTTP(RequestMethod(GET)
,rspBody
,simpleHTTP
)
import Network.HTTP.Base(Request(..))
import Network.URI(parseURI)
import System.Directory (doFileExist)
import System.Environment(getArgs)
import System.IO(hClose
,hPutStr
,hPutStrLn
,IOMode(WriteMode)
, openFile
,stderr
)
import Text.Printf(printf)
indices :: [String]
indices =
map format1 [ 0..9] ++ map format2 [0..14] ++ [40001-41284:: String]
其中
format1 index =
printf%d-%d((index * 1000 + 1):: Int)
(((index + 1)* 1000):: Int)
format2 index =
printf%d-%d((10000 + 2 * index * 1000 + 1):: Int)
((10000 +(2 * index + 2)* 1000 ):: Int)
$ b $ main main :: IO()
main = do
[dir]< - getArgs
updateDownloads dir
updateDownloads :: FilePath - > IO()
updateDownloads path = do
let
fileNames = map(\index - >
(index,path ++/ tv_and_movie_freqlist++ index ++ .html))索引
缺少< -
filterM(\(_,fileName) - > liftM不是$ doesFileExist fileName)fileNames
mapM_(\(index,fileName) - > do
let
url =
http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/++
index
request =
请求
{rqURI = fromJust $ parseURI url
,rqMethod = GET
,rqHeaders = []
,rqBody =
}
hPutStrLn stderr $Downloading++显示url
resp< - simpleHTTP请求
的情况resp = $ Left→ - > hPutStrLn stderr $连接到+ + show url
正确的响应 - > do
let
html = rspBody响应
文件< - openFile fileName WriteMo de
hPutStr file html
hClose file
return())missing
我想要并行运行下载。我知道 par
,但我不确定它是否可以在 IO
monad中使用,如果是这样, ?
更新:这里是我的代码,使用 Control.Concurrent.Async
重新实现, mapConcurrently
:
模块Main其中
import Control.Concurrent.Async(mapConcurrently)
import Control.Monad(filterM
,liftM
)
import Data.Maybe(fromJust)
import Network.HTTP( RequestMethod(GET)
,rspBody
,simpleHTTP
)
import Network.HTTP.Base(Request(..))
import Network.URI(parseURI)
import System.Directory(doesFileExist)
import System.Environment(getArgs)
import System.IO(hClose
,hPutStr
,hPutStrLn
,IOMode( WriteMode)
,openFile
,stderr
)
import Text.Printf(printf)
indices :: [String]
indices =
map format1 [0..9] ++ map format2 [0..14] ++ [40001-41284:: String]
其中
format1 index =
printf%d-%d((index * 1000 + 1):: Int)
(((index + 1)* 1000):: Int)
format2 index =
printf%d-%d((10000+ *(index * 1000 + 1):: int)
((10000+(2 * index + 2)* 1000):: Int)
$ b $ main :: IO()
main = do
[dir]< - getArgs
updateDownloads dir
updateDownloads :: FilePath - > IO()
updateDownloads path = do
let
fileNames = map(\index - >
(index,path ++/ tv_and_movie_freqlist++ index ++ .html))索引
缺少< -
filterM(\(_,fileName) - > liftM不是$ doesFileExist fileName)fileNames
页面< -
mapConcurrently (\(index,fileName) - > getUrl index fileName)missing
mapM_(\(fileName,html) - > do
handle< - openFile fileName WriteMode
hPutStr handle html
hClose句柄)pages
where
getUrl :: String - > FilePath - > IO(FilePath,String)
getUrl index fileName = do
let
url =
http://zh.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006 /++
索引
请求=
请求
{rqURI = fromJust $ parseURI url
,rqMethod = GET
,rqHeaders = []
,rqBody =
}
resp< - simpleHTTP请求
case $ res
Left _ - >
hPutStrLn stderr $连接到++显示网址
返回(,)
正确响应 - >
return(fileName,rspBody response)
查看 mapConcurrently
来自Simon Marlow的异步库。
它将 IO
动作并行且异步地映射到 Traversable
容器并等待所有操作。
示例:
{ - #LANGUAGE PackageImports# - }
导入System.Environment(getArgs)
导入asyncControl.Concurrent.Async(mapConcurrently)
导入HTTPNetwork.HTTP
导入HTTPNetwork.Stream(结果)
导入HTTPNetwork.HTTP.Base(响应(..))
import System.IO
importurlNetwork.URL(encString)
import Control.Monad
getURL :: String - > IO(String,Result(Response String))
getURL url = do
res< - (simpleHTTP。getRequest)url
return(url,res)
main = do
args< - getArgs
case $ args
[] - > putStrLn用法:程序url1 url2 ... urlN
args - > do
results< - mapConcurrently getURL args
forM_ results $ \(url,res) - >做
案例res
Left connError - > putStrLn $ url ++;++显示connError
正确的回应 - >做
putStrLn $ url ++; OK
let content = rspBody response
- 从url创建名字
fname = encString True(`notElem` :/)url ++.html
writeFile fname content
Below is Haskell code that (HTTP) downloads files that are missing from the given directory:
module Main where
import Control.Monad ( filterM
, liftM
)
import Data.Maybe ( fromJust )
import Network.HTTP ( RequestMethod(GET)
, rspBody
, simpleHTTP
)
import Network.HTTP.Base ( Request(..) )
import Network.URI ( parseURI )
import System.Directory ( doesFileExist )
import System.Environment ( getArgs )
import System.IO ( hClose
, hPutStr
, hPutStrLn
, IOMode(WriteMode)
, openFile
, stderr
)
import Text.Printf ( printf )
indices :: [String]
indices =
map format1 [0..9] ++ map format2 [0..14] ++ ["40001-41284" :: String]
where
format1 index =
printf "%d-%d" ((index * 1000 + 1) :: Int)
(((index + 1) * 1000) :: Int)
format2 index =
printf "%d-%d" ((10000 + 2 * index * 1000 + 1) :: Int)
((10000 + (2 * index + 2) * 1000) :: Int)
main :: IO ()
main = do
[dir] <- getArgs
updateDownloads dir
updateDownloads :: FilePath -> IO ()
updateDownloads path = do
let
fileNames = map (\index ->
(index, path ++ "/tv_and_movie_freqlist" ++ index ++ ".html")) indices
missing <-
filterM (\(_, fileName) -> liftM not $ doesFileExist fileName) fileNames
mapM_ (\(index, fileName) -> do
let
url =
"http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/" ++
index
request =
Request
{ rqURI = fromJust $ parseURI url
, rqMethod = GET
, rqHeaders = []
, rqBody = ""
}
hPutStrLn stderr $ "Downloading " ++ show url
resp <- simpleHTTP request
case resp of
Left _ -> hPutStrLn stderr $ "Error connecting to " ++ show url
Right response -> do
let
html = rspBody response
file <- openFile fileName WriteMode
hPutStr file html
hClose file
return ()) missing
I would like to run the downloads in parallel. I know about par
, but am not sure if it can be used in the IO
monad, and if so, how?
UPDATE: Here is my code reimplemented using Control.Concurrent.Async
and mapConcurrently
:
module Main where
import Control.Concurrent.Async ( mapConcurrently )
import Control.Monad ( filterM
, liftM
)
import Data.Maybe ( fromJust )
import Network.HTTP ( RequestMethod(GET)
, rspBody
, simpleHTTP
)
import Network.HTTP.Base ( Request(..) )
import Network.URI ( parseURI )
import System.Directory ( doesFileExist )
import System.Environment ( getArgs )
import System.IO ( hClose
, hPutStr
, hPutStrLn
, IOMode(WriteMode)
, openFile
, stderr
)
import Text.Printf ( printf )
indices :: [String]
indices =
map format1 [0..9] ++ map format2 [0..14] ++ ["40001-41284" :: String]
where
format1 index =
printf "%d-%d" ((index * 1000 + 1) :: Int)
(((index + 1) * 1000) :: Int)
format2 index =
printf "%d-%d" ((10000 + 2 * index * 1000 + 1) :: Int)
((10000 + (2 * index + 2) * 1000) :: Int)
main :: IO ()
main = do
[dir] <- getArgs
updateDownloads dir
updateDownloads :: FilePath -> IO ()
updateDownloads path = do
let
fileNames = map (\index ->
(index, path ++ "/tv_and_movie_freqlist" ++ index ++ ".html")) indices
missing <-
filterM (\(_, fileName) -> liftM not $ doesFileExist fileName) fileNames
pages <-
mapConcurrently (\(index, fileName) -> getUrl index fileName) missing
mapM_ (\(fileName, html) -> do
handle <- openFile fileName WriteMode
hPutStr handle html
hClose handle) pages
where
getUrl :: String -> FilePath -> IO (FilePath, String)
getUrl index fileName = do
let
url =
"http://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/TV/2006/" ++
index
request =
Request
{ rqURI = fromJust $ parseURI url
, rqMethod = GET
, rqHeaders = []
, rqBody = ""
}
resp <- simpleHTTP request
case resp of
Left _ -> do
hPutStrLn stderr $ "Error connecting to " ++ show url
return ("", "")
Right response ->
return (fileName, rspBody response)
Have a look at mapConcurrently
from Simon Marlow's "async" library.
It maps an IO
action in parallel and asynchronously to the elements of a Traversable
container and waits for all actions.
Example:
{-# LANGUAGE PackageImports #-}
import System.Environment (getArgs)
import "async" Control.Concurrent.Async (mapConcurrently)
import "HTTP" Network.HTTP
import "HTTP" Network.Stream (Result)
import "HTTP" Network.HTTP.Base (Response(..))
import System.IO
import "url" Network.URL (encString)
import Control.Monad
getURL :: String -> IO (String, Result (Response String))
getURL url = do
res <- (simpleHTTP . getRequest) url
return (url, res)
main = do
args <- getArgs
case args of
[] -> putStrLn "usage: program url1 url2 ... urlN"
args -> do
results <- mapConcurrently getURL args
forM_ results $ \(url, res) -> do
case res of
Left connError -> putStrLn $ url ++ "; " ++ show connError
Right response -> do
putStrLn $ url ++ "; OK"
let content = rspBody response
-- make name from url
fname = encString True (`notElem` ":/") url ++ ".html"
writeFile fname content
这篇关于在Haskell中运行并行URL下载的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!