Skip to content
90 changes: 81 additions & 9 deletions html2docx/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,8 @@ def xml(self):
return cElementTree.tostring(self.tree)


class ParagraphParser(object):
html_to_ooxml_tag_conversions = {
'strong': 'bold',
'em': 'italics',
}
class BaseParser(object):
abstract = True

def __init__(self, element):
self.element = element
Expand Down Expand Up @@ -39,17 +36,20 @@ def _parse(self, element, styles):
if element.tail:
yield element.tail, styles[-1]


class ParagraphParser(BaseParser):
html_to_ooxml_tag_conversions = {
'strong': 'bold',
'em': 'italics',
}

def build_runs(self):
for text, styles in self.parse(self.element):
run = Run(text)
for style in styles:
ooxml_style = self.html_to_ooxml_tag_conversions.get(style)
if ooxml_style:
setattr(run.properties, ooxml_style, True)
if 'strong' in styles:
run.properties.bold = True
if 'em' in styles:
run.properties.italics = True
yield run

@property
Expand Down Expand Up @@ -130,3 +130,75 @@ def italics(self, value):
self._italics = True
else:
self._italics = False


class TableParser(BaseParser):
    """Parser that turns an HTML ``<table>`` element into a ``Table`` tag."""

    @property
    def tag(self):
        """Return a ``Table`` holding one ``TableRowParser`` per table row.

        Rows are collected in document order from the direct ``<tr>``
        children of the table and from the ``<tr>`` children of any
        ``<thead>``, ``<tbody>`` or ``<tfoot>`` wrapper. Such wrappers are
        legal HTML, and a plain ``findall('tr')`` would silently drop every
        row inside them.

        Only these two levels are inspected, so rows belonging to a table
        nested inside a cell are never mistaken for rows of this table.
        """
        row_group_tags = ('thead', 'tbody', 'tfoot')
        table_rows = []
        for child in self.element:
            if child.tag == 'tr':
                table_rows.append(TableRowParser(child))
            elif child.tag in row_group_tags:
                # Flatten the row group: OOXML rows sit directly in w:tbl.
                for row in child.findall('tr'):
                    table_rows.append(TableRowParser(row))
        return Table(table_rows)


class Table(BaseTag):
    """OOXML table tag (``w:tbl``) assembled from a sequence of row parsers.

    ``table_rows`` is an iterable of objects whose ``tag.tree`` yields the
    element for one row, or ``None`` for an empty table.
    """

    tag_name = 'w:tbl'

    def __init__(self, table_rows=None):
        self.table_rows = table_rows

    @property
    def tree(self):
        """Build the ``w:tbl`` element, appending one child per table row."""
        root = cElementTree.Element(self.tag_name)
        rows = () if self.table_rows is None else self.table_rows
        for row in rows:
            root.append(row.tag.tree)
        return root


class TableRowParser(BaseParser):
    """Parser that turns an HTML ``<tr>`` element into a ``TableRow`` tag."""

    @property
    def tag(self):
        """Return a ``TableRow`` holding one ``TableCellParser`` per cell.

        Both data cells (``<td>``) and header cells (``<th>``) are direct
        children of a row and are collected in document order. Matching
        only ``td`` would silently discard the content of every header
        cell. Header cells receive no special formatting here; they are
        converted exactly like data cells.
        """
        cell_tags = ('td', 'th')
        table_cells = [
            TableCellParser(child)
            for child in self.element
            if child.tag in cell_tags
        ]
        return TableRow(table_cells)


class TableRow(BaseTag):
    """OOXML table row tag (``w:tr``) assembled from cell parsers.

    ``table_cells`` is an iterable of objects whose ``tag.tree`` yields the
    element for one cell, or ``None`` for an empty row.
    """

    tag_name = 'w:tr'

    def __init__(self, table_cells=None):
        self.table_cells = table_cells

    @property
    def tree(self):
        """Build the ``w:tr`` element, appending one child per table cell."""
        root = cElementTree.Element(self.tag_name)
        cells = () if self.table_cells is None else self.table_cells
        for cell in cells:
            root.append(cell.tag.tree)
        return root


class TableCellParser(BaseParser):
    """Parser that turns an HTML table cell element into a ``TableCell``."""

    @property
    def tag(self):
        """Return a ``TableCell`` wrapping a paragraph parser for the cell.

        The cell element itself is handed to ``ParagraphParser``, so the
        cell's content is converted as a single paragraph.
        """
        return TableCell(ParagraphParser(self.element))


class TableCell(BaseTag):
    """OOXML table cell tag (``w:tc``) wrapping at most one content parser.

    ``element`` is an object whose ``tag.tree`` yields the element placed
    inside the cell, or ``None`` for an empty cell.
    """

    tag_name = 'w:tc'

    def __init__(self, element=None):
        self.element = element

    @property
    def tree(self):
        """Build the ``w:tc`` element, with the content's tree if present."""
        cell = cElementTree.Element(self.tag_name)
        if self.element is not None:
            cell.append(self.element.tag.tree)
        return cell
13 changes: 10 additions & 3 deletions html2docx/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
from jinja2 import Environment, PackageLoader

from html2docx.utils import ZipFile
from html2docx.builder import ParagraphParser
from html2docx.builder import ParagraphParser, TableParser


# Maps an HTML tag name to the parser class that converts that element.
# The conversion loop looks each element up here by its ``tag``.
tag_to_parser_conversions = {
'p': ParagraphParser,
'table': TableParser
}


class HTML2Docx(object):
Expand Down Expand Up @@ -47,8 +53,9 @@ def _convert(self):
if el in self.visited:
continue
self.visited.update([el])
if el.tag == 'p':
parser = ParagraphParser(el)
Parser = tag_to_parser_conversions.get(el.tag)
if Parser:
parser = Parser(el)
self.document_state.append(parser.tag)
self.visited.update(el.getiterator())

Expand Down
3 changes: 3 additions & 0 deletions html2docx/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ class TestDocx2Html(Docx2Html):
def style(*args, **kwargs):
    """Return an empty string, whatever arguments are supplied."""
    no_markup = ''
    return no_markup

def table(self, text):
    """Return ``text`` wrapped in an HTML ``<table>`` element."""
    template = '<table>%s</table>'
    return template % text


def build_run(test_name, html):
boiler_plate = '<html><head></head><body>%s</body></html>'
Expand Down
109 changes: 108 additions & 1 deletion html2docx/tests/test_builder.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,17 @@
from xml.etree import cElementTree
from unittest import TestCase

from html2docx.builder import RunProperties, ParagraphParser, Paragraph
from html2docx.builder import (
Paragraph,
ParagraphParser,
RunProperties,
Table,
TableCell,
TableCellParser,
TableParser,
TableRow,
TableRowParser,
)


class RunPropertiesTestCase(TestCase):
Expand Down Expand Up @@ -75,3 +85,100 @@ def test_empty(self):

xml = paragraph.xml
self.assertEqual(xml, expected_xml)


class TableCellParserTestCase(TestCase):
    """Check the XML produced by ``TableCellParser`` for ``<td>`` input."""

    def _render(self, html):
        """Parse ``html`` and return the XML of the resulting cell tag."""
        cell_element = cElementTree.fromstring(html)
        return TableCellParser(cell_element).tag.xml

    def test_simple(self):
        expected_xml = '<w:tc><w:p><w:r><w:rPr /><w:t>AAA</w:t></w:r></w:p></w:tc>'  # noqa
        self.assertEqual(self._render('<td>AAA</td>'), expected_xml)

    def test_with_style(self):
        expected_xml = '<w:tc><w:p><w:r><w:rPr><w:b /></w:rPr><w:t>AAA</w:t></w:r></w:p></w:tc>'  # noqa
        self.assertEqual(
            self._render('<td><strong>AAA</strong></td>'),
            expected_xml,
        )


class TableCellTestCase(TestCase):
    """Check the XML of a ``TableCell`` built without any content."""

    def test_empty(self):
        # A cell constructed with no element serialises as a bare w:tc.
        self.assertEqual(TableCell().xml, '<w:tc />')


class TableRowParserTestCase(TestCase):
    """Check the XML produced by ``TableRowParser`` for ``<tr>`` input."""

    def _render(self, html):
        """Parse ``html`` and return the XML of the resulting row tag."""
        row_element = cElementTree.fromstring(html)
        return TableRowParser(row_element).tag.xml

    def test_simple(self):
        expected_xml = '<w:tr><w:tc><w:p><w:r><w:rPr /><w:t>AAA</w:t></w:r></w:p></w:tc></w:tr>'  # noqa
        self.assertEqual(self._render('<tr><td>AAA</td></tr>'), expected_xml)

    def test_with_style(self):
        expected_xml = '<w:tr><w:tc><w:p><w:r><w:rPr><w:b /></w:rPr><w:t>AAA</w:t></w:r></w:p></w:tc></w:tr>'  # noqa
        self.assertEqual(
            self._render('<tr><td><strong>AAA</strong></td></tr>'),
            expected_xml,
        )

    def test_multiple_cells(self):
        expected_xml = '<w:tr><w:tc><w:p><w:r><w:rPr /><w:t>AAA</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:rPr /><w:t>BBB</w:t></w:r></w:p></w:tc></w:tr>'  # noqa
        self.assertEqual(
            self._render('<tr><td>AAA</td><td>BBB</td></tr>'),
            expected_xml,
        )


class TableRowTestCase(TestCase):
    """Check the XML of a ``TableRow`` built without any cells."""

    def test_empty(self):
        # A row constructed with no cells serialises as a bare w:tr.
        self.assertEqual(TableRow().xml, '<w:tr />')


class TableParserTestCase(TestCase):
    """Check the XML produced by ``TableParser`` for ``<table>`` input."""

    def _render(self, html):
        """Parse ``html`` and return the XML of the resulting table tag."""
        table_element = cElementTree.fromstring(html)
        return TableParser(table_element).tag.xml

    def test_simple(self):
        expected_xml = '<w:tbl><w:tr><w:tc><w:p><w:r><w:rPr /><w:t>AAA</w:t></w:r></w:p></w:tc></w:tr></w:tbl>'  # noqa
        self.assertEqual(
            self._render('<table><tr><td>AAA</td></tr></table>'),
            expected_xml,
        )

    def test_with_style(self):
        expected_xml = '<w:tbl><w:tr><w:tc><w:p><w:r><w:rPr><w:b /></w:rPr><w:t>AAA</w:t></w:r></w:p></w:tc></w:tr></w:tbl>'  # noqa
        self.assertEqual(
            self._render('<table><tr><td><strong>AAA</strong></td></tr></table>'),  # noqa
            expected_xml,
        )

    def test_multiple_cells(self):
        expected_xml = '<w:tbl><w:tr><w:tc><w:p><w:r><w:rPr /><w:t>AAA</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:rPr /><w:t>BBB</w:t></w:r></w:p></w:tc></w:tr><w:tr><w:tc><w:p><w:r><w:rPr /><w:t>CCC</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:rPr /><w:t>DDD</w:t></w:r></w:p></w:tc></w:tr></w:tbl>'  # noqa
        self.assertEqual(
            self._render('<table><tr><td>AAA</td><td>BBB</td></tr><tr><td>CCC</td><td>DDD</td></tr></table>'),  # noqa
            expected_xml,
        )


class TableTestCase(TestCase):
    """Check the XML of a ``Table`` built without any rows."""

    def test_empty(self):
        # A table constructed with no rows serialises as a bare w:tbl.
        self.assertEqual(Table().xml, '<w:tbl />')
24 changes: 24 additions & 0 deletions html2docx/tests/test_complex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from html2docx.tests import build_run


# (test name, HTML body) pairs mixing paragraphs and tables; each pair is
# passed to build_run by test().
test_cases = [
(
'Test paragraph, table, paragraph.',
'<p>AAA</p><table><tr><td>BBB</td></tr></table><p>CCC</p>',
),
(
'Test table, table, paragraph',
'<table><tr><td>AAA</td></tr></table><table><tr><td>BBB</td></tr></table><p>CCC</p>', # noqa
),
# Nested tables are not supported yet, so this case stays disabled.
# (
# 'Test Nested Table',
# '<table><tr><td>AAA</td><td><table><tr><td>BBB</td></tr></table></td></tr></table>', # noqa
# ),
]


def test():
    """Yield one ``build_run`` result for every entry in ``test_cases``."""
    for name, markup in test_cases:
        yield build_run(name, markup)
27 changes: 27 additions & 0 deletions html2docx/tests/test_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from html2docx.tests import build_run


# (test name, HTML body) pairs covering tables with one or several rows and
# cells; each pair is passed to build_run by test().
test_cases = [
(
'Test simple table.',
'<table><tr><td>AAA</td></tr></table>',
),
(
'Test multiple rows.',
'<table><tr><td>AAA</td></tr><tr><td>BBB</td></tr></table>',
),
(
'Test multiple cells.',
'<table><tr><td>AAA</td><td>BBB</td></tr></table>',
),
(
'Test multiple rows and cells.',
'<table><tr><td>AAA</td><td>BBB</td></tr><tr><td>CCC</td><td>DDD</td></tr></table>', # noqa
),
]


def test():
    """Yield one ``build_run`` result for every entry in ``test_cases``."""
    for name, markup in test_cases:
        yield build_run(name, markup)
4 changes: 1 addition & 3 deletions run_tests.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#! /bin/sh

RUN_TESTS='nosetests -v -v --with-coverage --cover-erase --cover-package=. html2docx'
echo $RUN_TESTS
$RUN_TESTS
# Run the test suite with coverage, then lint every Python file.
# find is given an explicit starting directory (BSD/macOS find rejects the
# path-less form) and runs flake8 itself through -exec, so paths containing
# whitespace reach flake8 intact instead of being split by xargs.
nosetests -v -v --with-coverage --cover-erase --cover-package=html2docx html2docx && find . -name '*.py' -exec flake8 {} +