test fscrawler

#download package
[root@localhost fs]# wget https://oss.sonatype.org/content/repositories/snapshots/fr/pilato/elasticsearch/crawler/fscrawler-es7/2.7-SNAPSHOT/fscrawler-es7-2.7-20200403.145039-96.zip
--2020-04-08 09:52:01--  https://oss.sonatype.org/content/repositories/snapshots/fr/pilato/elasticsearch/crawler/fscrawler-es7/2.7-SNAPSHOT/fscrawler-es7-2.7-20200403.145039-96.zip
Resolving oss.sonatype.org (oss.sonatype.org)... 34.225.164.131, 52.20.193.6, 3.228.150.45, ...
Connecting to oss.sonatype.org (oss.sonatype.org)|34.225.164.131|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 96052581 (92M) [application/zip]
Saving to: ‘fscrawler-es7-2.7-20200403.145039-96.zip’

100%[===========================================================================================================================================================>] 96,052,581  9.58MB/s   in 12s

2020-04-08 09:52:17 (7.59 MB/s) - ‘fscrawler-es7-2.7-20200403.145039-96.zip’ saved [96052581/96052581]

#make config

#config package

[root@localhost fscrawler-es7-2.7-SNAPSHOT]# vi bin/fscrawler
#!/bin/sh
CDPATH=""
SCRIPT="$0"
JAVA_HOME="/usr/share/java-se-8u41-ri"

[root@localhost fscrawler-es7-2.7-SNAPSHOT]# bin/fscrawler --config_dir ./config fs_index
10:13:31,288 INFO  [f.p.e.c.f.c.BootstrapChecks] Memory [Free/Total=Percent]: HEAP [55mb/782.5mb=7.04%], RAM [104mb/3.4gb=2.96%], Swap [1.9gb/1.9gb=99.27%].
10:13:31,330 WARN  [f.p.e.c.f.c.FsCrawlerCli] job [fs_index] does not exist
10:13:31,331 INFO  [f.p.e.c.f.c.FsCrawlerCli] Do you want to create it (Y/N)?
y
10:13:48,080 INFO  [f.p.e.c.f.c.FsCrawlerCli] Settings have been created in [./config/fs_index/_settings.yaml]. Please review and edit before relaunch
[root@localhost fscrawler-es7-2.7-SNAPSHOT]# vi ./config/fs_index/_settings.yaml
---
name: "fs_index"
fs:
  url: "/home/weblogic/docs"
  update_rate: "15m"
  excludes:
  - "*/~*"
  json_support: false
  filename_as_id: false
  add_filesize: true
  remove_deleted: true
  add_as_inner_object: false
  store_source: false
  index_content: true
  attributes_support: false
  raw_metadata: false
  xml_support: false
  index_folders: true
  lang_detect: false
  continue_on_error: false
  ocr:
    language: "eng"
    enabled: true
    pdf_strategy: "ocr_and_text"
  follow_symlinks: false
elasticsearch:
  nodes:
  - url: "http://192.168.0.102:9200"
  bulk_size: 100
  flush_interval: "5s"
  byte_size: "10mb"

#config tokenizer
[root@localhost 7]# pwd
/root/fscrawler-es7-2.7-SNAPSHOT/config/_default/7
[root@localhost 7]# ll
합계 12
-rw-r--r--. 1 root root 4676  4월  8 10:13 _settings.json
-rw-r--r--. 1 root root  538  4월  8 10:13 _settings_folder.json




#config package

[root@localhost fscrawler-es7-2.7-SNAPSHOT]# vi bin/fscrawler
#!/bin/sh
CDPATH=""


댓글

이 블로그의 인기 게시물

[!] CDN: trunk URL couldn't be downloaded: https://cdn.cocoapods.org/CocoaPods-version.yml Response: URL using bad/illegal format or missing URL

starcraft map

Data Analysis with Superset - boardless chart