为什么以下程序中的 for 循环执行需要很长时间 [英] Why are the for loops taking too much time to execute in the following program
问题描述
在这个程序中,我为文本提取逻辑提供了一个 Wikipedia URL,但在提取文本之后,for 循环需要花费很长时间才能执行完。
同样的逻辑在 Python 程序中却运行得很快。
如何缩短执行时间?
import java.io.IOException;
import java.net.URL;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a web page, removes the pieces that lie between a
 * {@code <script} opening and the matching {@code </script>} closing, and
 * prints the first line of what is left, timing both processing loops.
 *
 * NOTE: this is the program the question is about. The repeated
 * {@code String +=} inside the loops and the {@code Pattern.compile} call in
 * {@link #search} are intentionally left as the author wrote them, because
 * they are exactly what the answers below discuss.
 */
public class TextExtraction1
{
    // Shared instance; assigned in main() and used by extractText().
    static TextExtraction1 fj;

    /**
     * Reads the resource behind {@code url} line by line.
     *
     * @param url address to open
     * @return the content, with every line terminated by '\n'
     * @throws IOException if the URL cannot be opened
     */
    public String toHtmlString(String url) throws IOException
    {
        StringBuilder sb = new StringBuilder();
        for (Scanner sc = new Scanner(new URL(url).openStream()); sc.hasNext(); )
            sb.append(sc.nextLine()).append('\n');
        return sb.toString();
    }

    /**
     * Counts how many times the regular expression {@code key} matches in
     * {@code target}. A new Pattern is compiled on every call.
     *
     * @param key    regular expression to look for
     * @param target text to scan
     * @return number of matches found by successive {@code find()} calls
     */
    static int search(String key, String target)
    {
        int count = 0;
        Pattern p = Pattern.compile(key);
        Matcher m = p.matcher(target);
        while (m.find()) { count++; }
        return count;
    }

    /**
     * Fetches the page at {@code s}, joins its lines, drops script sections
     * and returns the first line of the remaining text. Prints the elapsed
     * whole seconds of each of the two loops.
     *
     * @param s URL of the page
     * @return first '\n'-separated piece of the filtered text
     * @throws IOException if the page cannot be read
     */
    String extractText(String s) throws IOException
    {
        String h1 = fj.toHtmlString(s);
        System.out.println("extracted \n\n");
        int i2 = 0;
        String h2[] = h1.split("\n");
        String html = "";
        long start = System.currentTimeMillis();
        // Loop 1: glue all lines back together into one string.
        for (String h3 : h2)
        {
            //bw.write(h3);bw.newLine();
            html += h3;
            html += "";
            //iu=iu+1;
        }
        long end = System.currentTimeMillis();
        // Integer division: only whole seconds are reported.
        System.out.println(++i2 + " th loop end in " + (end - start) / 1000 + " seconds");
        boolean capture = true;
        String filtered_text = "";
        String html_text[] = html.split("<");
        String h_text[];
        //System.out.println("kyhe1");
        start = System.currentTimeMillis();
        // Loop 2: cut the page at '<' and '>' and keep every piece that is
        // not between a "<script" piece and a "</script>" piece.
        for (String h : html_text)
        {
            h = "<" + h; // split() removed the '<'; put it back
            h_text = h.split(">");
            for (String w : h_text)
            {
                if (w.length() > 0)
                {
                    // Pieces that start a tag lost their closing '>' in split().
                    if (w.substring(0, 1).equals("<")) { w += ">"; }
                }
                if (search("</script>", w) > 0) { capture = true; }
                else if (search("<script", w) > 0) { capture = false; }
                else if (capture) { filtered_text += w; filtered_text += "\n"; }
            }
        }
        // System.out.println("kyhe1");
        end = System.currentTimeMillis();
        html_text = filtered_text.split("\n");
        System.out.println(++i2 + " th loop end in " + (end - start) / 1000 + " seconds");
        return html_text[0];
    }

    /** Runs the extraction on a fixed Wikipedia article and prints the result. */
    public static void main(String[] args) throws IOException
    {
        fj = new TextExtraction1();
        System.out.println(fj.extractText("https://en.wikipedia.org/wiki/Varanasi"));
    }
}
同样逻辑的 Python 代码却运行得很快
import urllib2
import re
import sys
def get_text(f1):
    """Filter script sections out of the HTML read from *f1*.

    *f1* is any object with a ``read()`` method that returns the page as
    a string. The lines are joined with spaces, the page is cut at ``<``
    and ``>``, and every piece that is not between a ``<script`` piece
    and a ``</script>`` piece is appended to ``filtered_text``.

    NOTE: this is the script from the question, kept as the author wrote
    it. ``filtered_text`` is built but never returned, so the function
    returns ``None``; ``f`` is opened but never written to or closed; and
    ``i`` is never used.
    """
    h1 = f1.read()
    html = ''  # h3 is a string
    h2 = h1.split('\n')
    f = open("guru99.txt", "w+")
    for h3 in h2:
        html += h3
        html += ' '
    capture = True
    filtered_text = ''
    html_text = html.split('<')
    i = 0
    for h in html_text:
        h = '<' + h  # split() removed the '<'; put it back
        h_text = h.split('>')
        for w in h_text:
            if w:
                # Pieces that start a tag lost their closing '>' in split().
                if w[0] == '<':
                    w += '>'
            if re.search(r'</script>', w):
                capture = True
            elif re.search(r'<script', w):
                capture = False
            else:
                if capture:
                    filtered_text += w
                    filtered_text += '\n'


def get_url_text(url):
    """Open *url* with urllib2 and pass the response to ``get_text``.

    Returns a lone newline string when the request raises ``HTTPError``
    or ``URLError``; otherwise returns whatever ``get_text`` returns.
    """
    try:
        f = urllib2.urlopen(url)
    except (urllib2.HTTPError, urllib2.URLError):
        return '\n'
    else:
        return get_text(f)


def main():
    """Run the extraction on the URL given as the first command-line argument."""
    get_url_text(sys.argv[1])


if __name__ == "__main__":
    main()
我尝试了什么:
我只是把 for 循环改成了 while 循环
String h3 = ""; int i3 = 0;
while (i3 < h2.length)
{
    //bw.write(h3);bw.newLine();
    h3 = h2[i3];
    html += h3;
    html += ""; i3++; //iu=iu+1;
}
您应该尝试优化Java代码。
通过避免在循环内创建动态对象,可以实现最佳优化。
例如:
# Python
if w:
    if w[0] == '<':
        w += '>'
// Java
if (w.length() > 0)
{
    if (w.substring(0, 1).equals("<"))
    {
        w += ">";
    }
}
这里的 substring
会动态创建一个新字符串并执行字符串比较。
为什么不直接使用 String.charAt()
并执行字符比较?
if (w.length() > 0)
{
    if (w.charAt(0) == '<')
    {
        w += ">";
    }
}
另一个优化是使用类成员或静态成员来保存正则表达式搜索所用的 Pattern 对象,
这样 Pattern.compile()
就不必被多次执行。
有一种工具可以让你知道程序把时间花在了哪里,它叫做性能分析器(Profiler)。
分析(计算机编程) - 维基百科 [ ^ ]
每次需要连接字符串时,都应该尝试使用 StringBuilder。
注意
filtered_text += w; filtered_text += "\n";
慢于
filtered_text += w + "\n";
In this program, I have given a Wikipedia URL to the text-extraction logic, but after the text has been extracted the "for" loops take too much time to execute.
The same logic runs very fast in a Python program.
How can I reduce the execution time?
import java.io.IOException;
import java.net.URL;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads a web page, removes the pieces that lie between a
 * {@code <script} opening and the matching {@code </script>} closing, and
 * prints the first line of what is left, timing both processing loops.
 */
public class TextExtraction1
{
    // Shared instance; assigned in main() and used by extractText().
    static TextExtraction1 fj;

    /**
     * Reads the resource behind {@code url} line by line.
     *
     * @param url address to open
     * @return the content, with every line terminated by '\n'
     * @throws IOException if the URL cannot be opened
     */
    public String toHtmlString(String url) throws IOException
    {
        StringBuilder sb = new StringBuilder();
        for(Scanner sc = new Scanner(new URL(url).openStream()); sc.hasNext(); )
            sb.append(sc.nextLine()).append('\n');
        return sb.toString();
    }

    /**
     * Counts how many times the regular expression {@code key} matches in
     * {@code target}. A new Pattern is compiled on every call.
     *
     * @param key    regular expression to look for
     * @param target text to scan
     * @return number of matches found by successive {@code find()} calls
     */
    static int search(String key,String target)
    {
        int count=0;
        Pattern p=Pattern.compile(key);
        Matcher m=p.matcher(target);
        while(m.find()){count++;}
        return count;
    }

    /**
     * Fetches the page at {@code s}, joins its lines, drops script sections
     * and returns the first line of the remaining text. Prints the elapsed
     * whole seconds of each of the two loops.
     *
     * @param s URL of the page
     * @return first '\n'-separated piece of the filtered text
     * @throws IOException if the page cannot be read
     */
    String extractText(String s) throws IOException
    {
        String h1 = fj.toHtmlString(s);
        System.out.println("extracted \n\n");
        int i2=0;
        String h2[] = h1.split("\n");
        String html="";
        long start = System.currentTimeMillis();
        // Loop 1: glue all lines back together. Each += on a String builds a
        // new String object.
        for(String h3:h2)
        {
            //bw.write(h3);bw.newLine();
            html += h3;
            html += "";
            //iu=iu+1;
        }
        long end = System.currentTimeMillis();
        // Integer division: only whole seconds are reported.
        System.out.println(++i2+" th loop end in "+(end-start)/1000+" seconds");
        boolean capture = true;
        String filtered_text = "";
        String html_text[] = html.split("<");
        String h_text[];
        //System.out.println("kyhe1");
        start = System.currentTimeMillis();
        // Loop 2: cut the page at '<' and '>' and keep every piece that is
        // not between a "<script" piece and a "</script>" piece.
        for(String h:html_text)
        {
            h = "<" + h; // split() removed the '<'; put it back
            h_text = h.split(">");
            for(String w :h_text)
            {
                if(w.length()>0)
                {
                    // Pieces that start a tag lost their closing '>' in split().
                    if(w.substring(0, 1).equals("<")){w +=">";}
                }
                // search() compiles its pattern again on every call here.
                if(search("</script>",w)>0){capture=true;}
                else if(search("<script",w)>0){capture=false;}
                else if(capture){filtered_text += w; filtered_text += "\n";}
            }
        }
        // System.out.println("kyhe1");
        end = System.currentTimeMillis();
        html_text = filtered_text.split("\n");
        System.out.println(++i2+" th loop end in "+(end-start)/1000+" seconds");
        return html_text[0];
    }

    /** Runs the extraction on a fixed Wikipedia article and prints the result. */
    public static void main(String []args)throws IOException
    {
        fj = new TextExtraction1();
        System.out.println(fj.extractText("https://en.wikipedia.org/wiki/Varanasi"));
    }
}
The same Python code is very fast:
import urllib2
import re
import sys


# NOTE(review): the original indentation was lost when this script was
# flattened onto one line; the nesting below mirrors the Java version of the
# same program - confirm against the author's source.
def get_text(f1):
    """Filter script sections out of the HTML read from *f1*.

    The lines returned by ``f1.read()`` are joined with spaces, the page
    is cut at ``<`` and ``>``, and every piece that is not between a
    ``<script`` piece and a ``</script>`` piece is appended to
    ``filtered_text``.

    As written, ``filtered_text`` is never returned, so the function
    returns ``None``.
    """
    h1 = f1.read()
    html = ''  # h3 is a string
    h2 = h1.split('\n')
    # Opened but never written to or closed in this function.
    f= open("guru99.txt","w+")
    for h3 in h2:
        html += h3
        html += ' '
    capture = True
    filtered_text = ''
    html_text = html.split('<')
    i=0  # never used afterwards
    for h in html_text:
        h = '<' + h  # split() removed the '<'; put it back
        h_text = h.split('>')
        for w in h_text:
            if w:
                # Pieces that start a tag lost their closing '>' in split().
                if w[0] == '<':
                    w += '>'
            if re.search(r'</script>', w):
                capture = True
            elif re.search(r'<script', w):
                capture = False
            else:
                if capture:
                    filtered_text += w
                    filtered_text += '\n'


def get_url_text(url):
    """Open *url* with urllib2 and pass the response to ``get_text``.

    Returns a lone newline string when the request raises ``HTTPError``
    or ``URLError``; otherwise returns whatever ``get_text`` returns.
    """
    try :
        f = urllib2.urlopen(url)
    except (urllib2.HTTPError,urllib2.URLError) :
        return '\n'
    else:
        return get_text(f)


def main():
    """Run the extraction on the URL given as the first command-line argument."""
    get_url_text(sys.argv[1])


if __name__ == "__main__":
    main()
What I have tried:
I just converted the "for" loop into a while loop:
String h3=""; int i3=0;
while(i3<h2.length)
{
    //bw.write(h3);bw.newLine();
    h3=h2[i3];
    html += h3;
    html += ""; i3++; //iu=iu+1;
}
You should try to optimise the Java code.
The best optimisation can be achieved by avoiding dynamic object creation inside loops.
An example:
# Python
if w:
    if w[0] == '<':
        w += '>'
// Java
if(w.length()>0) { if(w.substring(0, 1).equals("<")) { w +=">"; } }
Here, substring()
will create a new string dynamically and perform a string comparison.
Why not just use String.charAt()
and perform a character comparison?
if(w.length()>0) { if(w.charAt(0) == '<') { w += ">"; } }
Another optimisation might be to use class or static members to store the regex search Pattern
objects that are used. Then Pattern.compile()
does not have to be executed multiple times.
There is a tool that lets you know where a program spends its time; it is called a profiler.
Profiling (computer programming) - Wikipedia[^]
You should try to use StringBuilder
every time you have to concatenate strings.
Note that
filtered_text += w; filtered_text += "\n";
is slower than
filtered_text += w + "\n";
这篇关于为什么循环在后续程序中执行需要很长时间的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!