[root@sd cgi-bin]# estwaver init camel
[root@sd cgi-bin]# ls camel/
estseek.cgi* estseek.conf estseek.help estseek.tmpl estseek.top estseek_web.cgi* estseek_web.conf index/
[root@sd cgi-bin]# nano estseek_web.conf
#
indexname: /home/camel/html/cgi-bin/camel/_index
tmplfile: estseek.tmpl
topfile: estseek.top
helpfile: estseek.help
lockindex: true
pseudoindex:
#replace: ^file:///home/mikio/public_html/{{!}}http://localhost/
#replace: /index\.html?${{!}}/
#
[root@sd cgi-bin]# cd camel
[root@sd camel]# ls
_conf _index/ _log _meta _queue _tmp/ _trace/
[root@sd camel]# nano _conf
# seed documents (weight and URL)
seed: 3.0|http://s1.servers.net/
seed: 3.0|http://s2.servers.net/
seed: 3.0|http://s3.servers.net/
seed: 3.0|http://s4.servers.net/
seed: 3.0|http://s5.servers.net/
seed: 3.0|http://s6.servers.net/
seed: 3.0|http://s7.servers.net/
seed: 3.0|http://s8.servers.net/
# host name of the proxy
proxyhost:
# port number of the proxy
proxyport:
# waiting interval of each request (in milliseconds)
interval: 500
# timeout of each request (in seconds)
timeout: 30
# strategy of crawling path (0:balanced, 1:similarity, 2:depth, 3:width, 4:random)
strategy: 3
# inheritance ratio of similarity from the parent
inherit: 0.4
# maximum depth of seed documents
seeddepth: 0
# maximum depth of recursion
maxdepth: 8
# standard value for checking mass sites
masscheck: 500
# maximum number of records of the priority queue
queuesize: 50000
# regular expressions and replacement strings to normalize URLs
# replace: ^http://127.0.0.1/{{!}}http://localhost/
# allowing regular expressions of URLs to be visited
allowrx: ^http://
# denying regular expressions of URLs to be visited
denyrx: \.(css|js|csv|tsv|log|md5|crc|conf|ini|inf|lnk|sys|tmp|bak)$
denyrx: \.(xml|xsl|xslt|rdf|rss|dtd|sgml|sgm)$
denyrx: \.(pgp|sig|cer|csr|pem|key|b64|uu|uue|[0-9])$
denyrx: \.(rtf|pdf|ps|eps|ai|doc|xls|ppt|sxw|sxc|sxi|xdw|jtd|oas|swf)$
denyrx: \.(zip|tar|tgz|gz|bz2|tbz2|z|lha|lzh)(\?.*)?$
denyrx: \.(7z|lzo|lzma|cpio|shar|cab|rar|sit|ace|hqx)(\?.*)?$
denyrx: \.(bin|o|a|so|exe|dll|lib|obj|ocx|class|jar|war)(\?.*)?$
denyrx: \.(rpm|deb|qdb|qdb|dbx|dbf|dat|msi|bat|com|iso)(\?.*)?$
denyrx: \.(png|gif|jpg|jpeg|tif|tiff|bmp|ico|pbm|pgm|ppm|xbm|xpm|dvi)$
denyrx: \.(au|snd|mid|midi|kar|smf|mp2|mp3|m3u|wav|wma|wmp|asx|at3|aif)$
denyrx: \.(mpg|mpeg|qt|mov|avi|wmv|wvx|asf|ram|rm)$
denyrx: (/core$|/core\.[0-9]*$|/camel/)
denyrx: ://(localhost|[a-z]*\.localdomain|127\.0\.0\.1)/
# preferred language (0:English, 1:Japanese, 2:Chinese, 3:Korean, 4:misc)
language: 1
# text size limitation (in kilobytes)
textlimit: 128
# total number of keywords for seed documents
seedkeynum: 256
# number of keywords saved for each document
savekeynum: 32
# number of threads running in parallel
threadnum: 10
# number of documents to collect
docnum: 30000
# running time period (in s:seconds, m:minutes, h:hours, d:days)
period: 10000s
# revisit span (in s:seconds, m:minutes, h:hours, d:days)
revisit: 7d
# maximum size of the index cache (in megabytes)
cachesize: 256
#
[root@sd camel]# su -
[root@sd ~]# cd /home/camel/html/cgi-bin
[root@sd cgi-bin]# estwaver crawl camel