<?xml version="1.0" encoding="UTF-8"?>
<crawl-order xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="heritrix_settings.xsd">
<meta>
<name>forsider_plus_2niveauer_facebook</name>
<description>Profile to harvest the front pages and two hops of links, with double-slash removal.
max-toe-threads=50,
max-hops=2,
timeout-seconds=1200,
pool-max-active=5
</description>
<operator>Admin</operator>
<organization/>
<audience>TESTERS</audience>
<date>20080118111217</date>
</meta>
<controller>
<string name="settings-directory">settings</string>
<string name="disk-path"/>
<string name="logs-path">logs</string>
<string name="checkpoints-path">checkpoints</string>
<string name="state-path">state</string>
<string name="scratch-path">scratch</string>
<long name="max-bytes-download">0</long>
<long name="max-document-download">0</long>
<long name="max-time-sec">0</long>
<integer name="max-toe-threads">50</integer>
<integer name="recorder-out-buffer-bytes">4096</integer>
<integer name="recorder-in-buffer-bytes">65536</integer>
<integer name="bdb-cache-percent">0</integer>
<!-- DecidingScope migrated from DomainScope -->
<newObject name="scope" class="org.archive.crawler.deciderules.DecidingScope">
<boolean name="enabled">true</boolean>
<string name="seedsfile">seeds.txt</string>
<boolean name="reread-seeds-on-config">true</boolean>
<!-- DecideRuleSequence: the rules are applied in order and the last non-PASS decision is the result -->
<newObject name="decide-rules">
<map name="rules">
<newObject name="rejectByDefault" class="org.archive.crawler.deciderules.RejectDecideRule"/>
<newObject name="acceptURIFromSeedDomains" class="dk.netarkivet.harvester.harvesting.OnNSDomainsDecideRule">
<string name="decision">ACCEPT</string>
<string name="surts-source-file"/>
<boolean name="seeds-as-surt-prefixes">true</boolean>
<string name="surts-dump-file"/>
<boolean name="also-check-via">false</boolean>
<boolean name="rebuild-on-reconfig">true</boolean>
</newObject>
<newObject class="org.archive.crawler.deciderules.TooManyHopsDecideRule">
<integer name="max-hops">25</integer>
</newObject>
<newObject name="rejectIfPathological" class="org.archive.crawler.deciderules.PathologicalPathDecideRule">
<integer name="max-repetitions">3</integer>
</newObject>
<newObject name="acceptIfTranscluded" class="org.archive.crawler.deciderules.TransclusionDecideRule">
<integer name="max-trans-hops">3</integer>
<integer name="max-speculative-hops">1</integer>
</newObject>
<newObject name="pathdepthfilter" class="org.archive.crawler.deciderules.TooManyPathSegmentsDecideRule">
<integer name="max-path-depth">20</integer>
</newObject>
<newObject name="webbyen">
<string name="decision">REJECT</string>
<string name="regexp">.*webbyen.*kontakt\.asp.*</string>
</newObject>
<newObject name="dr_dk">
<string name="decision">REJECT</string>
<string name="regexp">.*dr\.dk.*epg\.asp.*</string>
</newObject>
<newObject name="stiften">
<string name="decision">REJECT</string>
<string name="regexp">.*stiften.*adstream_mjx\.ads.*</string>
</newObject>
<newObject name="halibut_dk" class="org.archive.crawler.deciderules.MatchesRegExpDecideRule">
<string name="decision">REJECT</string>
<string name="regexp">.*halibut\.dk\/cgi-bin.*</string>
</newObject>
<newObject name="cybercomputer_dk" class="org.archive.crawler.deciderules.MatchesRegExpDecideRule">
<string name="decision">REJECT</string>
<string name="regexp">.*cybercomputer\.dk\/putikurv.*</string>
</newObject>
<newObject name="tawselovers_dk" class="org.archive.crawler.deciderules.MatchesRegExpDecideRule">
<string name="decision">REJECT</string>
<string name="regexp">.*tawselovers.*action=buy_now.*</string>
</newObject>
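<!-- Further crawler traps can be rejected by adding more MatchesRegExpDecideRule
entries to this map. A hypothetical sketch (name and regexp are placeholders,
not part of this profile):
<newObject name="example_dk" class="org.archive.crawler.deciderules.MatchesRegExpDecideRule">
<string name="decision">REJECT</string>
<string name="regexp">.*example\.dk\/calendar\.asp.*</string>
</newObject>
Rules run in order and the last non-PASS decision wins, so a REJECT here
overrides the earlier ACCEPT from the seed-domain rule. -->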
</map> <!-- end rules -->
</newObject> <!-- end decide-rules -->
</newObject> <!-- End DecidingScope -->
<map name="http-headers">
<string name="user-agent">Mozilla/5.0 (compatible; heritrix/1.5.0-200506132127 +http://netarkivet.dk/website/info.html)</string>
<string name="from">netarkivet-svar@netarkivet.dk</string>
</map>
<newObject name="robots-honoring-policy" class="org.archive.crawler.datamodel.RobotsHonoringPolicy">
<string name="type">ignore</string>
<boolean name="masquerade">false</boolean>
<text name="custom-robots"/>
<stringList name="user-agents">
</stringList>
</newObject>
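<!-- "ignore" makes Heritrix fetch but disregard robots.txt rules. Other
RobotsHonoringPolicy types in Heritrix 1.x include "classic" (obey robots.txt),
"custom" (use the custom-robots text above) and "most-favored". -->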
<newObject name="frontier" class="org.archive.crawler.frontier.BdbFrontier">
<float name="delay-factor">1.0</float>
<integer name="max-delay-ms">1000</integer>
<integer name="min-delay-ms">300</integer>
<integer name="max-retries">3</integer>
<long name="retry-delay-seconds">300</long>
<integer name="preference-embed-hops">1</integer>
<integer name="total-bandwidth-usage-KB-sec">1500</integer>
<integer name="max-per-host-bandwidth-usage-KB-sec">500</integer>
<string name="queue-assignment-policy">dk.netarkivet.harvester.harvesting.DomainnameQueueAssignmentPolicy</string>
<string name="force-queue-assignment"/>
<boolean name="pause-at-start">false</boolean>
<boolean name="pause-at-finish">false</boolean>
<boolean name="source-tag-seeds">false</boolean>
<boolean name="recovery-log-enabled">false</boolean>
<boolean name="hold-queues">true</boolean>
<integer name="balance-replenish-amount">3000</integer>
<integer name="error-penalty-amount">100</integer>
<long name="queue-total-budget">-1</long>
<string name="cost-policy">org.archive.crawler.frontier.UnitCostAssignmentPolicy</string>
<long name="snooze-deactivate-ms">300000</long>
<integer name="target-ready-backlog">50</integer>
<string name="uri-included-structure">org.archive.crawler.util.BdbUriUniqFilter</string>
<boolean name="dump-pending-at-close">false</boolean>
</newObject>
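<!-- Politeness sketch: before revisiting a queue, the frontier waits
delay-factor times the duration of the last fetch, clamped to
[min-delay-ms, max-delay-ms]. With the values above, a 200 ms fetch gives
200 ms * 1.0 = 200 ms, raised to the 300 ms floor; a 5000 ms fetch is capped
at 1000 ms. retry-delay-seconds (300 s) applies only to retried failures. -->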
<map name="uri-canonicalization-rules">
<newObject name="Lowercase" class="org.archive.crawler.url.canonicalize.LowercaseRule">
<boolean name="enabled">true</boolean>
</newObject>
<newObject name="Userinfo" class="org.archive.crawler.url.canonicalize.StripUserinfoRule">
<boolean name="enabled">true</boolean>
</newObject>
<newObject name="WWW" class="org.archive.crawler.url.canonicalize.StripWWWRule">
<boolean name="enabled">false</boolean>
</newObject>
<newObject name="SessionIDs" class="org.archive.crawler.url.canonicalize.StripSessionIDs">
<boolean name="enabled">true</boolean>
</newObject>
<newObject name="QueryStrPrefix" class="org.archive.crawler.url.canonicalize.FixupQueryStr">
<boolean name="enabled">true</boolean>
</newObject>
</map>
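<!-- Canonicalization sketch for the rules enabled above, on a hypothetical URI:
HTTP://User:Pass@Example.dk/Page?PHPSESSID=abc&x=1 is lowercased, the
user:pass@ userinfo is stripped, and StripSessionIDs removes the session
token, leaving roughly http://example.dk/page?x=1. StripWWWRule is disabled,
so www.example.dk and example.dk remain distinct for duplicate detection. -->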
<!-- Heritrix pre-fetch processors -->
<map name="pre-fetch-processors">
<newObject name="QuotaEnforcer" class="org.archive.crawler.prefetch.QuotaEnforcer">
<boolean name="force-retire">false</boolean>
<boolean name="enabled">true</boolean>
<newObject name="QuotaEnforcer#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map name="rules">
</map>
</newObject>
<long name="server-max-fetch-successes">-1</long>
<long name="server-max-success-kb">-1</long>
<long name="server-max-fetch-responses">-1</long>
<long name="server-max-all-kb">-1</long>
<long name="host-max-fetch-successes">-1</long>
<long name="host-max-success-kb">-1</long>
<long name="host-max-fetch-responses">-1</long>
<long name="host-max-all-kb">-1</long>
<long name="group-max-fetch-successes">-1</long>
<long name="group-max-success-kb">-1</long>
<long name="group-max-fetch-responses">-1</long>
<long name="group-max-all-kb">-1</long>
</newObject>
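<!-- All quotas above are -1, i.e. unlimited, so the QuotaEnforcer is effectively
inactive. A hedged example: to cap each frontier group (one per domain under
the DomainnameQueueAssignmentPolicy) at roughly 10 MB of downloaded data, set
<long name="group-max-all-kb">10000</long>
(the -kb settings are in kilobytes). -->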
<newObject name="Preselector" class="org.archive.crawler.prefetch.Preselector">
<boolean name="enabled">true</boolean>
<newObject name="Preselector#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map name="rules">
</map>
</newObject>
<boolean name="override-logger">false</boolean>
<boolean name="recheck-scope">true</boolean>
<boolean name="block-all">false</boolean>
<string name="block-by-regexp"/>
<string name="allow-by-regexp"/>
</newObject>
<newObject name="Preprocessor" class="org.archive.crawler.prefetch.PreconditionEnforcer">
<boolean name="enabled">true</boolean>
<newObject name="Preprocessor#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map name="rules">
</map>
</newObject>
<integer name="ip-validity-duration-seconds">21600</integer>
<integer name="robot-validity-duration-seconds">86400</integer>
<boolean name="calculate-robots-only">false</boolean>
</newObject>
</map> <!--End of Heritrix pre-fetch processors -->
<!-- Heritrix fetch processors -->
<map name="fetch-processors">
<newObject name="DNS">
<boolean name="enabled">true</boolean>
<newObject name="DNS#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map>
</map>
</newObject>
<boolean name="accept-non-dns-resolves">false</boolean>
<boolean name="digest-content">true</boolean>
<string name="digest-algorithm">sha1</string>
</newObject>
<newObject name="HTTP">
<boolean name="enabled">true</boolean>
<newObject name="HTTP#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map>
</map>
</newObject>
<newObject name="midfetch-decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map name="rules">
</map>
</newObject>
<integer name="timeout-seconds">1200</integer>
<integer name="sotimeout-ms">20000</integer>
<integer name="fetch-bandwidth">0</integer>
<long name="max-length-bytes">0</long>
<boolean name="ignore-cookies">false</boolean>
<boolean name="use-bdb-for-cookies">true</boolean>
<string name="load-cookies-from-file"/>
<string name="save-cookies-to-file"/>
<string name="trust-level">open</string>
<stringList name="accept-headers">
</stringList>
<string name="http-proxy-host"/>
<string name="http-proxy-port"/>
<string name="default-encoding">ISO-8859-1</string>
<boolean name="digest-content">true</boolean>
<string name="digest-algorithm">sha1</string>
<boolean name="send-if-modified-since">true</boolean>
<boolean name="send-if-none-match">true</boolean>
<boolean name="send-connection-close">true</boolean>
<boolean name="send-referer">true</boolean>
<boolean name="send-range">false</boolean>
<string name="http-bind-address"/>
</newObject>
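<!-- timeout-seconds (1200 s) bounds a whole fetch; sotimeout-ms (20 s) bounds
each socket read. max-length-bytes and fetch-bandwidth of 0 mean no limit here.
A hedged example: truncating every response at about 10 MB would be
<long name="max-length-bytes">10000000</long> -->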
</map> <!-- end of Heritrix Fetch processors -->
<!-- Heritrix extract processors -->
<map name="extract-processors">
<newObject class="org.archive.crawler.extractor.ExtractorHTTP">
<boolean name="enabled">true</boolean>
<newObject name="ExtractorHTTP#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map name="rules">
</map>
</newObject>
</newObject>
<newObject name="ExtractorHTML" class="org.archive.crawler.extractor.ExtractorHTML">
<boolean name="enabled">true</boolean>
<newObject name="ExtractorHTML#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map name="rules">
</map>
</newObject>
<boolean name="extract-javascript">true</boolean>
<boolean name="treat-frames-as-embed-links">true</boolean>
<boolean name="ignore-form-action-urls">true</boolean>
<boolean name="extract-value-attributes">true</boolean>
<boolean name="ignore-unexpected-html">true</boolean>
</newObject>
<newObject name="ExtractorCSS">
<boolean name="enabled">true</boolean>
<newObject name="ExtractorCSS#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map>
</map>
</newObject>
</newObject>
<newObject name="ExtractorJS" class="org.archive.crawler.extractor.ExtractorJS">
<boolean name="enabled">true</boolean>
<newObject name="ExtractorJS#decide-rules">
<map>
</map>
</newObject>
</newObject>
<newObject name="ExtractorSWF" class="org.archive.crawler.extractor.ExtractorSWF">
<boolean name="enabled">true</boolean>
<newObject name="ExtractorSWF#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map name="rules">
</map>
</newObject>
</newObject>
<newObject name="ExtractorImpliedURI" class="org.archive.crawler.extractor.ExtractorImpliedURI">
<boolean name="enabled">true</boolean>
<string name="trigger-regexp">(^http.*://.*)//(.*$)</string>
<string name="build-pattern">$1/$2</string>
<boolean name="remove-trigger-uris">false</boolean>
</newObject>
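<!-- Worked example of the double-slash rewrite above: for the hypothetical URI
http://example.dk/a//b.html the greedy trigger-regexp captures
$1 = http://example.dk/a and $2 = b.html (the scheme's :// is consumed by the
leading http.*:// part), so build-pattern yields http://example.dk/a/b.html,
which is queued as an implied URI. With remove-trigger-uris set to false, the
original double-slash URI is still processed as well. -->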
</map> <!-- end of Heritrix extract processors -->
<!-- Heritrix write processors -->
<map name="write-processors">
<newObject class="is.hi.bok.deduplicator.DeDuplicator">
<boolean>true</boolean>
<map>
</map>
<string/>
<string>By URL</string>
<boolean name="try-equivalent">true</boolean>
<boolean name="change-content-size">false</boolean>
<string name="mime-filter">^text/.*</string>
<string name="filter-mode">Blacklist</string>
<string name="analysis-mode">Timestamp</string>
<string name="log-level">SEVERE</string>
<string name="origin"/>
<string name="origin-handling">Use index information</string>
<boolean name="stats-per-host">true</boolean>
<boolean name="use-sparse-range-filter">true</boolean>
</newObject>
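<!-- The DeDuplicator checks each response against a Lucene index from an
earlier crawl (index-location is normally filled in at job-generation time).
With matching-method "By URL", a document is marked duplicate when the index
holds the same URL with the same content digest; try-equivalent also consults
canonicalized variants. The mime-filter ^text/.* with filter-mode Blacklist
exempts text documents, so only non-text responses (images, media, etc.) are
candidates for deduplication. -->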
<newObject name="Archiver" class="org.archive.crawler.writer.ARCWriterProcessor">
<boolean name="enabled">true</boolean>
<newObject name="Archiver#decide-rules">
<map>
</map>
</newObject>
<boolean name="compress">false</boolean>
<string name="prefix">IAH</string>
<string name="suffix">${HOSTNAME}</string>
<integer name="max-size-bytes">100000000</integer>
<stringList name="path">
<string>arcs</string>
</stringList>
<integer name="pool-max-active">5</integer>
<integer name="pool-max-wait">300000</integer>
<long name="total-bytes-to-write">0</long>
<boolean name="skip-identical-digests">false</boolean>
</newObject>
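<!-- ARC writing sketch: uncompressed ARC files go under the arcs/ path and are
rolled over after max-size-bytes (100000000 bytes, about 95 MiB). Heritrix 1.x
names them prefix-timestamp-serial-suffix.arc, e.g. the hypothetical
IAH-20080118111217-00000-myhost.arc (suffix ${HOSTNAME} expands to the crawler
host). total-bytes-to-write 0 means no overall cap; pool-max-active 5 allows
five writers to be open at once. -->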
</map> <!-- End of Heritrix write processors -->
<!-- Heritrix post processors -->
<map name="post-processors">
<newObject name="Updater" class="org.archive.crawler.postprocessor.CrawlStateUpdater">
<boolean name="enabled">true</boolean>
<newObject name="Updater#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map name="rules">
</map>
</newObject>
</newObject>
<newObject name="LinksScoper">
<boolean name="enabled">true</boolean>
<newObject name="LinksScoper#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map>
</map>
</newObject>
<boolean name="override-logger">false</boolean>
<boolean name="seed-redirects-new-seed">true</boolean>
<integer name="preference-depth-hops">-1</integer>
<newObject name="scope-rejected-url-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map name="rules">
</map>
</newObject>
</newObject>
<newObject name="Scheduler" class="org.archive.crawler.postprocessor.FrontierScheduler">
<boolean name="enabled">true</boolean>
<newObject name="Scheduler#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map name="rules">
</map>
</newObject>
</newObject>
<newObject name="ContentSize">
<boolean>true</boolean>
<newObject name="ContentSize#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
<map>
</map>
</newObject>
</newObject>
</map> <!-- end of Heritrix post processors -->
<map name="loggers">
<newObject name="crawl-statistics" class="org.archive.crawler.admin.StatisticsTracker">
<integer name="interval-seconds">20</integer>
</newObject>
</map>
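<!-- interval-seconds 20 sets how often the StatisticsTracker samples the crawl,
which drives the progress-statistics.log lines and the web UI totals. -->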
<string name="recover-path"/>
<boolean name="checkpoint-copy-bdbje-logs">true</boolean>
<boolean name="recover-retain-failures">false</boolean>
<newObject name="credential-store" class="org.archive.crawler.datamodel.CredentialStore">
<map name="credentials">
</map>
</newObject>
</controller>
</crawl-order>