require 'fileutils'
require 'net/http'
require 'json'
require 'uri'
require 'v8'
module Jekyll
  module LunrJsSearch
    # Jekyll generator that builds a lunr.js search index at site build time.
    # Pages, posts and collection documents are rendered, their text
    # extracted, and both the lunr index and a compact docs lookup table are
    # written to <dest>/<js_dir>/index.json. Extra *.min.js assets bundled
    # with the gem are copied alongside it.
    class Indexer < Jekyll::Generator
      def initialize(config = {})
        super(config)

        # Defaults, overridable through the site config's 'lunr_search' key.
        lunr_config = {
          'excludes' => [],
          'strip_index_html' => false,
          'min_length' => 3,
          'stopwords' => 'stopwords.txt',
          'fields' => {
            'title' => 10,
            'tags' => 20,
            'body' => 1
          },
          'js_dir' => 'js'
        }.merge!(config['lunr_search'] || {})

        @js_dir = lunr_config['js_dir']

        # Prefer the lunr.js bundled with the gem; fall back to a copy kept
        # in the site's own js directory.
        gem_lunr = File.join(File.dirname(__FILE__), "../../build/lunr.min.js")
        @lunr_path = File.exist?(gem_lunr) ? gem_lunr : File.join(@js_dir, File.basename(gem_lunr))
        raise "Could not find #{@lunr_path}" unless File.exist?(@lunr_path)

        # Build the index with the real lunr.js inside an embedded V8
        # context, so server-side tokenization matches the browser exactly.
        ctx = V8::Context.new
        ctx.load(@lunr_path)
        ctx['indexer'] = proc do |this|
          this.ref('id')
          lunr_config['fields'].each_pair do |name, boost|
            this.field(name, { 'boost' => boost })
          end
        end
        @index = ctx.eval('lunr(indexer)')
        @lunr_version = ctx.eval('lunr.version')
        @docs = {}
        @excludes = lunr_config['excludes']

        # if web host supports index.html as default doc, then optionally exclude it from the url
        @strip_index_html = lunr_config['strip_index_html']

        # stop word exclusion configuration
        @min_length = lunr_config['min_length']
        @stopwords_file = lunr_config['stopwords']
      end

      # Index all pages except pages matching any value in config['lunr_excludes'] or with date['exclude_from_search']
      # The main content from each page is extracted and saved to disk as json
      def generate(site)
        Jekyll.logger.info "Lunr:", 'Creating search index...'

        @site = site
        # gather pages and posts
        items = pages_to_index(site)
        content_renderer = PageRenderer.new(site)

        items.each_with_index do |item, i|
          entry = SearchEntry.create(item, content_renderer)

          entry.strip_index_suffix_from_url! if @strip_index_html
          # File.exists? was removed in Ruby 3.2; File.exist? is the
          # supported spelling.
          entry.strip_stopwords!(stopwords, @min_length) if File.exist?(@stopwords_file)

          doc = {
            "id" => i,
            "title" => entry.title,
            "url" => entry.url,
            "date" => entry.date,
            "categories" => entry.categories,
            "body" => entry.body
          }

          @index.add(doc)
          # Only lunr needs the body; drop it so the docs lookup table
          # written to disk stays small.
          doc.delete("body")
          @docs[i] = doc

          Jekyll.logger.debug "Lunr:", (entry.title ? "#{entry.title} (#{entry.url})" : entry.url)
        end

        FileUtils.mkdir_p(File.join(site.dest, @js_dir))
        filename = File.join(@js_dir, 'index.json')

        total = {
          "docs" => @docs,
          "index" => @index.to_hash
        }

        filepath = File.join(site.dest, filename)
        # A lunr index nests deeply, hence the raised :max_nesting limit.
        File.open(filepath, "w") { |f| f.write(total.to_json(:max_nesting => 150)) }
        Jekyll.logger.info "Lunr:", "Index ready (lunr.js v#{@lunr_version})"
        added_files = [filename]

        site_js = File.join(site.dest, @js_dir)
        # If we're using the gem, add the lunr and search JS files to the _site
        if File.expand_path(site_js) != File.dirname(@lunr_path)
          extras = Dir.glob(File.join(File.dirname(@lunr_path), "*.min.js"))
          FileUtils.cp(extras, site_js)
          extras.map! { |min| File.join(@js_dir, File.basename(min)) }
          Jekyll.logger.debug "Lunr:", "Added JavaScript to #{@js_dir}"
          added_files.push(*extras)
        end

        # Keep the written files from being cleaned by Jekyll.
        # (Block param renamed from `filename` to avoid shadowing the
        # local defined above.)
        added_files.each do |file|
          site.static_files << SearchIndexFile.new(site, site.dest, "/", file)
        end
      end

      private

      # Stop words, one per line, lazily read from @stopwords_file.
      def stopwords
        @stopwords ||= IO.readlines(@stopwords_file).map { |l| l.strip }
      end

      # Output extension for a page/document. A Jekyll::Document needs a
      # Renderer to answer this; plain pages/posts know it themselves.
      def output_ext(doc)
        if doc.is_a?(Jekyll::Document)
          Jekyll::Renderer.new(@site, doc).output_ext
        else
          doc.output_ext
        end
      end

      # Everything that should be indexed: pages, posts and collection
      # documents that render to .html, are not matched by an excludes
      # pattern, and do not opt out via 'exclude_from_search' front matter.
      def pages_to_index(site)
        items = []

        # deep copy pages
        site.pages.each { |page| items << page.dup }
        site.posts.each { |post| items << post.dup }
        site.documents.each { |document| items << document.dup }

        # only process files that will be converted to .html and only non excluded files
        items.select! { |i| output_ext(i) == '.html' && !@excludes.any? { |s| (i.url =~ Regexp.new(s)) != nil } }
        items.reject! { |i| i.data['exclude_from_search'] }

        items
      end
    end
  end
end
require "v8"
require "json"
# Teach V8 objects to serialise themselves, so the lunr index (a JS object
# living inside the embedded engine) can be embedded in the JSON written
# to disk.
class V8::Object
  # Serialise this JS object using the engine's own JSON.stringify.
  def to_json
    stringify = @context['JSON']['stringify']
    stringify.call(self)
  end

  # Round-trip through JSON to obtain a plain Ruby Hash.
  # max_nesting is raised because a lunr index nests very deeply.
  def to_hash
    JSON.parse(to_json, max_nesting: 150)
  end
end
require 'nokogiri'
module Jekyll
  module LunrJsSearch
    # Renders a page/post/document and extracts the plain text that should
    # be fed into the search index.
    class PageRenderer
      def initialize(site)
        @site = site
      end

      # Render the item without its layout and return the raw output.
      def prepare(item)
        return Jekyll::Renderer.new(@site, item).run if item.is_a?(Jekyll::Document)

        # Render legacy pages/posts layout-free by dropping the layout key
        # from a copy of the front matter.
        item.data = item.data.dup
        item.data.delete("layout")
        item.render({}, @site.site_payload)
        item.output
      end

      # render the item, parse the output and get all text inside <p> elements
      def render(item)
        item.render(@site.layouts, @site.site_payload)
        html = Nokogiri::HTML(item.output)
        fragments = html.search('article').map { |node| node.content }
        # split (no args) breaks on any run of whitespace — including
        # \r, \n and \t — so rejoining collapses everything to single spaces.
        fragments.join(" ").split.join(" ")
      end
    end
  end
end
require 'nokogiri'
module Jekyll
  module LunrJsSearch
    # A single entry of the search index: page metadata plus the indexable
    # body text extracted from the rendered output.
    class SearchEntry
      # Build a SearchEntry from a Jekyll page/post/document, using the
      # given renderer to produce the body text. Raises for unknown types.
      def self.create(page_or_post, renderer)
        date, categories =
          case page_or_post
          when Jekyll::Post
            [page_or_post.date, page_or_post.categories]
          when Jekyll::Page, Jekyll::Document
            [nil, []]
          else
            raise 'Not supported'
          end

        title, url = extract_title_and_url(page_or_post)
        body = renderer.render(page_or_post)

        # NOTE(review): the renderer is passed as the `collection` argument
        # here — looks unintentional; confirm before relying on #collection.
        new(title, url, date, categories, body, renderer)
      end

      # Pull title and url out of the item's Liquid representation.
      def self.extract_title_and_url(item)
        liquid = item.to_liquid
        [liquid['title'], liquid['url']]
      end

      attr_reader :title, :url, :date, :categories, :body, :collection

      def initialize(title, url, date, categories, body, collection)
        @title = title
        @url = url
        @date = date
        @categories = categories
        @body = body
        @collection = collection
      end

      # Drop a trailing "index.html" from the url, for hosts that serve
      # directory indexes implicitly.
      def strip_index_suffix_from_url!
        @url.sub!(/index\.html$/, '')
      end

      # remove anything that is in the stop words list from the text to be indexed
      def strip_stopwords!(stopwords, min_length)
        kept = @body.split.reject do |word|
          token = word.downcase.gsub(/[^a-z]/, '')
          token.length < min_length || stopwords.include?(token)
        end
        @body = kept.join(' ')
      end
    end
  end
end
module Jekyll
  module LunrJsSearch
    # Static-file shim for files the indexer has already written into the
    # destination; registering instances with site.static_files keeps
    # Jekyll's cleanup pass from deleting them.
    class SearchIndexFile < Jekyll::StaticFile
      # Override write as the index.json index file has already been created
      def write(_dest)
        true
      end
    end
  end
end
module Jekyll
  module LunrJsSearch
    # Gem version string. Frozen so the shared constant cannot be mutated
    # by callers.
    VERSION = "0.3.0".freeze
  end
end