diff --git a/chap4-hbase/generate_wiki_links.rb b/chap4-hbase/generate_wiki_links.rb
index f5767e6..ccd2304 100644
--- a/chap4-hbase/generate_wiki_links.rb
+++ b/chap4-hbase/generate_wiki_links.rb
@@ -7,18 +7,21 @@
 # Visit http://www.pragmaticprogrammer.com/titles/rwdata for more book information.
 #---
 
-import 'org.apache.hadoop.hbase.client.HTable'
+import 'org.apache.hadoop.hbase.client.ConnectionFactory'
+import 'org.apache.hadoop.hbase.TableName'
 import 'org.apache.hadoop.hbase.client.Put'
 import 'org.apache.hadoop.hbase.client.Scan'
+import 'org.apache.hadoop.hbase.client.Durability'
 import 'org.apache.hadoop.hbase.util.Bytes'
 
 def jbytes( *args )
   return args.map { |arg| arg.to_s.to_java_bytes }
 end
 
-wiki_table = HTable.new( @hbase.configuration, 'wiki' )
-links_table = HTable.new( @hbase.configuration, 'links' )
-links_table.setAutoFlush( false )
+connection = ConnectionFactory.createConnection()
+
+wiki_table = connection.getTable( TableName.valueOf( "wiki" ) )
+links_table = connection.getBufferedMutator( TableName.valueOf( "links" ) )
 
 scanner = wiki_table.getScanner( Scan.new )    # (1)
 
@@ -35,29 +38,33 @@ def jbytes( *args )
     text.scan(linkpattern) do |target, label|    # (3)
       unless put_to
         put_to = Put.new( *jbytes( title ) )
-        put_to.setWriteToWAL( false )
+        put_to.setDurability(Durability::SKIP_WAL)
       end
 
       target.strip!
       target.capitalize!
 
+      if target.length == 0
+        next
+      end
+
       label = '' unless label
       label.strip!
 
-      put_to.add( *jbytes( "to", target, label ) )
+      put_to.addColumn( *jbytes( "to", target, label ) )
       put_from = Put.new( *jbytes( target ) )
-      put_from.add( *jbytes( "from", title, label ) )
-      put_from.setWriteToWAL( false )
-      links_table.put( put_from )    # (4)
+      put_from.addColumn( *jbytes( "from", title, label ) )
+      put_from.setDurability(Durability::SKIP_WAL)
+      links_table.mutate( put_from )    # (4)
     end
-    links_table.put( put_to ) if put_to    # (5)
-    links_table.flushCommits()
-
+    links_table.mutate( put_to ) if put_to and not put_to.isEmpty    # (5)
+    links_table.flush()
+
   end
 
   count += 1
   puts "#{count} pages processed (#{title})" if count % 500 == 0
 end
 
-links_table.flushCommits()
+links_table.flush()
 exit
diff --git a/chap4-hbase/import_from_wikipedia.rb b/chap4-hbase/import_from_wikipedia.rb
index 8fa4ae1..b724ccb 100644
--- a/chap4-hbase/import_from_wikipedia.rb
+++ b/chap4-hbase/import_from_wikipedia.rb
@@ -9,8 +9,9 @@
 
 require 'time'
 
-import 'org.apache.hadoop.hbase.client.HTable'
 import 'org.apache.hadoop.hbase.client.Put'
+import 'org.apache.hadoop.hbase.client.ConnectionFactory'
+import 'org.apache.hadoop.hbase.TableName'
 import 'javax.xml.stream.XMLStreamConstants'
 
 def jbytes( *args )
@@ -24,8 +25,10 @@ def jbytes( *args )
 
 buffer = nil
 count = 0
 
-table = HTable.new( @hbase.configuration, 'wiki' )
-table.setAutoFlush( false )    # (2)
+connection = ConnectionFactory.createConnection()
+
+table = connection.getBufferedMutator( TableName.valueOf( "wiki" ) )    # (2)
+
 while reader.has_next
   type = reader.next
@@ -51,13 +54,13 @@ def jbytes( *args )
       ts = ( Time.parse document['timestamp'] ).to_i
 
       p = Put.new( key, ts )
-      p.add( *jbytes( "text", "", document['text'] ) )
-      p.add( *jbytes( "revision", "author", document['username'] ) )
-      p.add( *jbytes( "revision", "comment", document['comment'] ) )
-      table.put( p )
+      p.addColumn( *jbytes( "text", "", document['text'] ) )
+      p.addColumn( *jbytes( "revision", "author", document['username'] ) )
+      p.addColumn( *jbytes( "revision", "comment", document['comment'] ) )
+      table.mutate( p )
 
       count += 1
-      table.flushCommits() if count % 10 == 0
+      table.flush() if count % 10 == 0
       if count % 500 == 0
         puts "#{count} records inserted (#{document['title']})"
       end
@@ -65,7 +68,7 @@ def jbytes( *args )
   end
 end
 
-table.flushCommits()
+table.flush()
 
 exit
diff --git a/chap4-hbase/put_multiple_columns.rb b/chap4-hbase/put_multiple_columns.rb
index ac92525..6090126 100644
--- a/chap4-hbase/put_multiple_columns.rb
+++ b/chap4-hbase/put_multiple_columns.rb
@@ -6,20 +6,22 @@
 # We make no guarantees that this code is fit for any purpose.
 # Visit http://www.pragmaticprogrammer.com/titles/rwdata for more book information.
 #---
 
-import 'org.apache.hadoop.hbase.client.HTable'
 import 'org.apache.hadoop.hbase.client.Put'
+import 'org.apache.hadoop.hbase.client.ConnectionFactory'
+import 'org.apache.hadoop.hbase.TableName'
 
 def jbytes( *args )
   args.map { |arg| arg.to_s.to_java_bytes }
 end
 
-table = HTable.new( @hbase.configuration, "wiki" )
+connection = ConnectionFactory.createConnection()
+
+table = connection.getTable( TableName.valueOf( "wiki" ) )
 
 p = Put.new( *jbytes( "Home" ) )
-p.add( *jbytes( "text", "", "Hello world" ) )
-p.add( *jbytes( "revision", "author", "jimbo" ) )
-p.add( *jbytes( "revision", "comment", "my first edit" ) )
+p.addColumn( *jbytes( "text", "", "Hello world" ) )
+p.addColumn( *jbytes( "revision", "author", "jimbo" ) )
+p.addColumn( *jbytes( "revision", "comment", "my first edit" ) )
 table.put( p )
 
-
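For reference, the three scripts above converge on the same post-1.0 HBase client pattern: create a single Connection through ConnectionFactory, then obtain a Table for synchronous operations or a BufferedMutator for client-side write buffering (the replacement for HTable#setAutoFlush(false)). The following is a minimal sketch of that pattern, not part of the patch, assuming it runs as a JRuby script under `hbase shell` against an HBase 1.0+ client and an existing 'wiki' table with 'text' and 'revision' column families; the row key "Example" and the explicit close() calls are illustrative additions.

# sketch.rb -- run with: hbase shell sketch.rb
import 'org.apache.hadoop.hbase.client.ConnectionFactory'
import 'org.apache.hadoop.hbase.client.Durability'
import 'org.apache.hadoop.hbase.client.Put'
import 'org.apache.hadoop.hbase.TableName'

def jbytes( *args )
  args.map { |arg| arg.to_s.to_java_bytes }
end

# One heavyweight, shareable Connection replaces the per-table HTable
# instances; configuration comes from hbase-site.xml on the classpath.
connection = ConnectionFactory.createConnection()

# Table: synchronous, unbuffered operations (the put_multiple_columns.rb case).
table = connection.getTable( TableName.valueOf( "wiki" ) )
put = Put.new( *jbytes( "Example" ) )                  # hypothetical row key
put.addColumn( *jbytes( "text", "", "Hello world" ) )
put.setDurability( Durability::SKIP_WAL )              # skip the write-ahead log, as in the patch
table.put( put )
table.close()

# BufferedMutator: batches writes client-side (the importer and link-generator
# case); flush() drains the buffer, and close() flushes before releasing.
mutator = connection.getBufferedMutator( TableName.valueOf( "wiki" ) )
mutator.mutate( put )
mutator.flush()
mutator.close()

connection.close()
exit

Note that buffered mutations are only guaranteed to reach the server after flush() or close(), which is why both bulk scripts flush at the end, and that SKIP_WAL trades crash safety for write speed during the load.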