Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 20 additions & 13 deletions chap4-hbase/generate_wiki_links.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,21 @@
# Visit http://www.pragmaticprogrammer.com/titles/rwdata for more book information.
#---

import 'org.apache.hadoop.hbase.client.HTable'
import 'org.apache.hadoop.hbase.client.ConnectionFactory'
import 'org.apache.hadoop.hbase.TableName'
import 'org.apache.hadoop.hbase.client.Put'
import 'org.apache.hadoop.hbase.client.Scan'
import 'org.apache.hadoop.hbase.client.Durability'
import 'org.apache.hadoop.hbase.util.Bytes'

def jbytes( *args )
return args.map { |arg| arg.to_s.to_java_bytes }
end

wiki_table = HTable.new( @hbase.configuration, 'wiki' )
links_table = HTable.new( @hbase.configuration, 'links' )
links_table.setAutoFlush( false )
connection = ConnectionFactory.createConnection()

wiki_table = connection.getTable( TableName.valueOf( "wiki" ) )
links_table = connection.getBufferedMutator( TableName.valueOf( "links" ) )

scanner = wiki_table.getScanner( Scan.new ) # (1)

Expand All @@ -35,29 +38,33 @@ def jbytes( *args )
text.scan(linkpattern) do |target, label| # (3)
unless put_to
put_to = Put.new( *jbytes( title ) )
put_to.setWriteToWAL( false )
put_to.setDurability(Durability::SKIP_WAL)
end

target.strip!
target.capitalize!

if target.length == 0
next
end

label = '' unless label
label.strip!

put_to.add( *jbytes( "to", target, label ) )
put_to.addColumn( *jbytes( "to", target, label ) )
put_from = Put.new( *jbytes( target ) )
put_from.add( *jbytes( "from", title, label ) )
put_from.setWriteToWAL( false )
links_table.put( put_from ) # (4)
put_from.addColumn( *jbytes( "from", title, label ) )
put_to.setDurability(Durability::SKIP_WAL)
links_table.mutate( put_from ) # (4)
end
links_table.put( put_to ) if put_to # (5)
links_table.flushCommits()
links_table.mutate( put_to ) if put_to and not put_to.isEmpty # (5)
links_table.flush()
end
count += 1
puts "#{count} pages processed (#{title})" if count % 500 == 0

end
links_table.flushCommits()
links_table.flush()
exit

21 changes: 12 additions & 9 deletions chap4-hbase/import_from_wikipedia.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@

require 'time'

import 'org.apache.hadoop.hbase.client.HTable'
import 'org.apache.hadoop.hbase.client.Put'
import 'org.apache.hadoop.hbase.client.ConnectionFactory'
import 'org.apache.hadoop.hbase.TableName'
import 'javax.xml.stream.XMLStreamConstants'

def jbytes( *args )
Expand All @@ -24,8 +25,10 @@ def jbytes( *args )
buffer = nil
count = 0

table = HTable.new( @hbase.configuration, 'wiki' )
table.setAutoFlush( false ) # (2)
connection = ConnectionFactory.createConnection()

table = connection.getBufferedMutator( TableName.valueOf( "wiki" ) ) # (2)


while reader.has_next
type = reader.next
Expand All @@ -51,21 +54,21 @@ def jbytes( *args )
ts = ( Time.parse document['timestamp'] ).to_i

p = Put.new( key, ts )
p.add( *jbytes( "text", "", document['text'] ) )
p.add( *jbytes( "revision", "author", document['username'] ) )
p.add( *jbytes( "revision", "comment", document['comment'] ) )
table.put( p )
p.addColumn( *jbytes( "text", "", document['text'] ) )
p.addColumn( *jbytes( "revision", "author", document['username'] ) )
p.addColumn( *jbytes( "revision", "comment", document['comment'] ) )
table.mutate( p )

count += 1
table.flushCommits() if count % 10 == 0
table.flush() if count % 10 == 0
if count % 500 == 0
puts "#{count} records inserted (#{document['title']})"
end
end
end
end

table.flushCommits()
table.flush()
exit


Expand Down
14 changes: 8 additions & 6 deletions chap4-hbase/put_multiple_columns.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,22 @@
# We make no guarantees that this code is fit for any purpose.
# Visit http://www.pragmaticprogrammer.com/titles/rwdata for more book information.
#---
import 'org.apache.hadoop.hbase.client.HTable'
import 'org.apache.hadoop.hbase.client.Put'
import 'org.apache.hadoop.hbase.client.ConnectionFactory'
import 'org.apache.hadoop.hbase.TableName'

def jbytes( *args )
args.map { |arg| arg.to_s.to_java_bytes }
end

table = HTable.new( @hbase.configuration, "wiki" )
connection = ConnectionFactory.createConnection()

table = connection.getTable( TableName.valueOf( "wiki" ) )

p = Put.new( *jbytes( "Home" ) )

p.add( *jbytes( "text", "", "Hello world" ) )
p.add( *jbytes( "revision", "author", "jimbo" ) )
p.add( *jbytes( "revision", "comment", "my first edit" ) )
p.addColumn( *jbytes( "text", "", "Hello world" ) )
p.addColumn( *jbytes( "revision", "author", "jimbo" ) )
p.addColumn( *jbytes( "revision", "comment", "my first edit" ) )

table.put( p )