[root@sd camel] # pwd
/home/camel/html/cgi-bin/camel
[root@sd camel] # vi _conf
# seed documents (weight and URL)
# Each entry is "weight|URL"; one "seed:" line per starting point.
# All five seeds below carry the same weight (1.5).
seed: 1.5|http://ns1.servers.net/
seed: 1.5|http://s2.servers.net/
seed: 1.5|http://s3.servers.net/
seed: 1.5|http://s4.servers.net/
seed: 1.5|http://s5.servers.net/
# host name of the proxy
# NOTE(review): left empty -- presumably means "no proxy"; confirm against the estwaver docs.
proxyhost:
# port number of the proxy
# NOTE(review): left empty together with proxyhost.
proxyport:
# waiting interval of each request (in milliseconds)
# 500 ms = half a second.
interval: 500
# timeout of each request (in seconds)
timeout: 30
# strategy of crawling path (0:balanced, 1:similarity, 2:depth, 3:width, 4:random)
# 2 selects the "depth" strategy from the list above.
strategy: 2
# inheritance ratio of similarity from the parent
inherit: 0.4
# maximum depth of seed documents
seeddepth: 0
# maximum depth of recursion
maxdepth: 10
# standard value for checking mass sites
# NOTE(review): the unit of this threshold is not stated here -- confirm before tuning.
masscheck: 500
# maximum number of records of the priority queue
queuesize: 50000
# regular expressions and replacement strings to normalize URLs
# Format: <regex>{{!}}<replacement>. The example below is disabled (commented out).
#replace: ^http://127.0.0.1/{{!}}http://localhost/
# allowing regular expressions of URLs to be visited
# Only URLs beginning with "http://" match; "https://" URLs do not match this pattern.
allowrx: ^http://
# denying regular expressions of URLs to be visited
# Each "denyrx:" line adds one pattern. Patterns ending in "(\?.*)?$" also match
# when a query string follows the extension; patterns ending in "$" only match
# when the URL ends with the extension.
# -- style sheets, scripts, data, log and configuration files
denyrx: \.(css|js|csv|tsv|log|md5|crc|conf|ini|inf|lnk|sys|tmp|bak)$
# -- XML-family markup and feeds
denyrx: \.(xml|xsl|xslt|rdf|rss|dtd|sgml|sgm)$
# -- signatures, certificates, encoded payloads, and single-digit extensions ([0-9])
denyrx: \.(pgp|sig|cer|csr|pem|key|b64|uu|uue|[0-9])$
# -- office, print and rich document formats
denyrx: \.(rtf|pdf|ps|eps|ai|doc|xls|ppt|sxw|sxc|sxi|xdw|jtd|oas|swf)$
# -- archives and compressed files
denyrx: \.(zip|tar|tgz|gz|bz2|tbz2|z|lha|lzh)(\?.*)?$
denyrx: \.(7z|lzo|lzma|cpio|shar|cab|rar|sit|ace|hqx)(\?.*)?$
# -- binaries, libraries and Java artifacts
denyrx: \.(bin|o|a|so|exe|dll|lib|obj|ocx|class|jar|war)(\?.*)?$
# -- packages, database files, installers and disk images
denyrx: \.(rpm|deb|qdb|dbx|dbf|dat|msi|bat|com|iso)(\?.*)?$
# -- images
denyrx: \.(png|gif|jpg|jpeg|tif|tiff|bmp|ico|pbm|pgm|ppm|xbm|xpm|dvi)$
# -- audio
denyrx: \.(au|snd|mid|midi|kar|smf|mp2|mp3|m3u|wav|wma|wmp|asx|at3|aif)$
# -- video
denyrx: \.(mpg|mpeg|qt|mov|avi|wmv|wvx|asf|ram|rm)$
# -- paths ending in "/core" or "/core.<digits>", and any URL containing "/camel/"
denyrx: (/core$|/core\.[0-9]*$|/camel/)
# -- loopback hosts: localhost, *.localdomain, 127.0.0.1
denyrx: ://(localhost|[a-z]*\.localdomain|127\.0\.0\.1)/
# denying regular expressions of URLs to be indexed
# Matches a "/?x=y" query made of single letters, followed by ";" or end of URL.
noidxrx: /\?[a-z]=[a-z](;|$)
# URL rules (regular expressions and media types)
# Format: <regex>{{!}}<media type>. The "{{!}}" token sits where the seed lines
# use "|" -- presumably because "|" is needed for alternation inside the regex;
# confirm against the estwaver documentation.
urlrule: \.est${{!}}text/x-estraier-draft
urlrule: \.(eml|mime|mht|mhtml)${{!}}message/rfc822
# media type rules (regular expressions and filter commands)
# Format: <regex>{{!}}<filter>; the bracketed names are the filters used below.
typerule: ^text/x-estraier-draft${{!}}[DRAFT]
typerule: ^text/plain${{!}}[TEXT]
# "+" is escaped so that it matches the literal plus sign in
# "application/xhtml+xml"; left unescaped, "l+" is a quantifier ("one or more l")
# and the pattern can never match that media type.
typerule: ^(text/html|application/xhtml\+xml)${{!}}[HTML]
typerule: ^message/rfc822${{!}}[MIME]
# preferred language (0:English, 1:Japanese, 2:Chinese, 3:Korean, 4:misc)
# 1 selects Japanese from the list above.
language: 1
# text size limitation (in kilobytes)
textlimit: 128
# total number of keywords for seed documents
seedkeynum: 256
# number of keywords saved for each document
savekeynum: 32
# number of threads running in parallel
threadnum: 10
# number of documents to collect
docnum: 10000
# running time period (in s:seconds, m:minutes, h:hours, d:days)
# 10000s = 10000 seconds, roughly 2.8 hours.
# NOTE(review): how this limit interacts with docnum is not stated here -- confirm.
period: 10000s
# revisit span (in s:seconds, m:minutes, h:hours, d:days)
# 7d = seven days.
revisit: 7d
# maximum size of the index cache (in megabytes)
cachesize: 256
# remote nodes for alternative indexes (ID number and URL)
# All three examples below are disabled (commented out). They embed
# "admin:admin" credentials in the URL; replace those before enabling any of them.
#nodeserv: 1|http://admin:admin@localhost:1978/node/node1
#nodeserv: 2|http://admin:admin@localhost:1978/node/node2
#nodeserv: 3|http://admin:admin@localhost:1978/node/node3
# path of the log file (relative path or absolute path)
# NOTE(review): "_log" is a relative path -- presumably resolved against the
# crawler's root directory; confirm.
logfile: _log
# logging level (1:debug, 2:information, 3:warning, 4:error, 5:none)
# 2 selects "information" from the list above.
loglevel: 2
# path of the draft directory (relative path or absolute path)
# NOTE(review): left empty -- presumably disables this output; confirm.
draftdir:
# path of the entity directory (relative path or absolute path)
# NOTE(review): left empty -- presumably disables this output; confirm.
entitydir:
# postprocessor for retrieved files
# NOTE(review): left empty -- presumably no postprocessor is run; confirm.
postproc:
:wq
[root@sd camel] # estwaver crawl camel