[root@sd camel] # pwd
/home/camel/html/cgi-bin/camel
[root@sd camel] # vi _conf
# seed documents (weight and URL)
seed: 1.5|http://ns1.servers.net/
seed: 1.5|http://s2.servers.net/
seed: 1.5|http://s3.servers.net/
seed: 1.5|http://s4.servers.net/
seed: 1.5|http://s5.servers.net/

# host name of the proxy
proxyhost:

# port number of the proxy
proxyport:

# waiting interval of each request (in milliseconds)
interval: 500

# timeout of each request (in seconds)
timeout: 30

# strategy of crawling path (0:balanced, 1:similarity, 2:depth, 3:width, 4:random)
strategy: 2

# inheritance ratio of similarity from the parent
inherit: 0.4

# maximum depth of seed documents
seeddepth: 0

# maximum depth of recursion
maxdepth: 10

# standard value for checking mass sites
masscheck: 500

# maximum number of records of the priority queue
queuesize: 50000

# regular expressions and replacement strings to normalize URLs
#replace: ^http://127.0.0.1/{{!}}http://localhost/

# allowing regular expressions of URLs to be visited
allowrx: ^http://

# denying regular expressions of URLs to be visited
denyrx: \.(css|js|csv|tsv|log|md5|crc|conf|ini|inf|lnk|sys|tmp|bak)$
denyrx: \.(xml|xsl|xslt|rdf|rss|dtd|sgml|sgm)$
denyrx: \.(pgp|sig|cer|csr|pem|key|b64|uu|uue|[0-9])$
denyrx: \.(rtf|pdf|ps|eps|ai|doc|xls|ppt|sxw|sxc|sxi|xdw|jtd|oas|swf)$
denyrx: \.(zip|tar|tgz|gz|bz2|tbz2|z|lha|lzh)(\?.*)?$
denyrx: \.(7z|lzo|lzma|cpio|shar|cab|rar|sit|ace|hqx)(\?.*)?$
denyrx: \.(bin|o|a|so|exe|dll|lib|obj|ocx|class|jar|war)(\?.*)?$
denyrx: \.(rpm|deb|qdb|dbx|dbf|dat|msi|bat|com|iso)(\?.*)?$
denyrx: \.(png|gif|jpg|jpeg|tif|tiff|bmp|ico|pbm|pgm|ppm|xbm|xpm|dvi)$
denyrx: \.(au|snd|mid|midi|kar|smf|mp2|mp3|m3u|wav|wma|wmp|asx|at3|aif)$
denyrx: \.(mpg|mpeg|qt|mov|avi|wmv|wvx|asf|ram|rm)$
denyrx: (/core$|/core\.[0-9]*$|/camel/)
denyrx: ://(localhost|[a-z]*\.localdomain|127\.0\.0\.1)/

# denying regular expressions of URLs to be indexed
noidxrx: /\?[a-z]=[a-z](;|$)

# URL rules (regular expressions and media types)
urlrule: \.est${{!}}text/x-estraier-draft
urlrule: \.(eml|mime|mht|mhtml)${{!}}message/rfc822

# media type rules (regular expressions and filter commands)
# NOTE: the "+" in application/xhtml+xml is escaped so that it is matched as a
# literal character rather than acting as a regex quantifier on the "l".
typerule: ^text/x-estraier-draft${{!}}[DRAFT]
typerule: ^text/plain${{!}}[TEXT]
typerule: ^(text/html|application/xhtml\+xml)${{!}}[HTML]
typerule: ^message/rfc822${{!}}[MIME]

# preferred language (0:English, 1:Japanese, 2:Chinese, 3:Korean, 4:misc)
language: 1

# text size limitation (in kilobytes)
textlimit: 128

# total number of keywords for seed documents
seedkeynum: 256

# number of keywords saved for each document
savekeynum: 32

# number of threads running in parallel
threadnum: 10

# number of documents to collect
docnum: 10000

# running time period (in s:seconds, m:minutes, h:hours, d:days)
period: 10000s

# revisit span (in s:seconds, m:minutes, h:hours, d:days)
revisit: 7d

# maximum size of the index cache (in megabytes)
cachesize: 256

# remote nodes for alternative indexes (ID number and URL)
#nodeserv: 1|http://admin:admin@localhost:1978/node/node1
#nodeserv: 2|http://admin:admin@localhost:1978/node/node2
#nodeserv: 3|http://admin:admin@localhost:1978/node/node3

# path of the log file (relative path or absolute path)
logfile: _log

# logging level (1:debug, 2:information, 3:warning, 4:error, 5:none)
loglevel: 2

# path of the draft directory (relative path or absolute path)
draftdir:

# path of the entity directory (relative path or absolute path)
entitydir:

# postprocessor for retrieved files
postproc:
:wq
[root@sd camel] # estwaver crawl camel