Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 17 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
[![Build Status](https://travis-ci.org/eentzel/htmltruncate.py.png)](https://travis-ci.org/eentzel/htmltruncate.py)
## htmltruncate

A module to truncate strings containing HTML.
[![Build Status](https://travis-ci.org/chadpaulson/htmltruncate.py.png)](https://travis-ci.org/chadpaulson/htmltruncate.py)

Returns a truncated string while preserving HTML markup (which does not count towards length). All tags left open by truncation are closed.

**Example**:

```python
>>> import htmltruncate
>>> str = "<p>You're not gonna lose the house, <b>everybody</b> has three mortgages nowadays.</p>"
>>> htmltruncate.truncate(str, 33)
"<p>You're not gonna lose the house, </p>"
```

**Options**:

```python
htmltruncate.truncate(str, target_len, ellipsis='')
>>> htmltruncate.truncate(str, 33, full_word=True, ellipsis="...")
"<p>You're not gonna lose the house, <b>everybody</b></p>..."
```
Returns a copy of str truncated to target_len characters,
preserving HTML markup (which does not count towards the length).
Any tags that would be left open by truncation will be closed at
the end of the returned string. Optionally append ellipsis if
the string was truncated.
22 changes: 13 additions & 9 deletions htmltruncate.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python

from __future__ import print_function
import sys


Expand All @@ -16,14 +15,14 @@ def __init__(self, tag, rest=''):

def as_string(self):
return '<' + self.tag + self.rest + '>'

# Token for a closing tag such as </b>. Subclasses OpenTag and overrides
# only the string rendering.
class CloseTag(OpenTag):
def as_string(self):
# Render as </tag>; unlike the '<' + tag + rest + '>' form, self.rest is
# not emitted here.
return '</' + self.tag + '>'

# Token type for self-closing tags; adds nothing to OpenTag.
# truncate() compares a token's class name against 'OpenTag' before pushing
# it on the stack of tags to close, and this subclass's name does not match.
class SelfClosingTag(OpenTag):
pass

class Tokenizer:
def __init__(self, input):
self.input = input
Expand All @@ -32,7 +31,7 @@ def __init__(self, input):
def __next_char(self):
self.counter += 1
return self.input[self.counter]

def next_token(self):
try:
char = self.input[self.counter]
Expand Down Expand Up @@ -62,7 +61,7 @@ def __entity(self):
entity.append(';')
self.counter += 1
return ''.join(entity)

def __open_tag(self):
"""Return an open/close tag token.
Precondition: self.counter points at the first character of the tag name
Expand Down Expand Up @@ -97,7 +96,7 @@ def __close_tag(self):
self.counter += 1
return CloseTag( ''.join(tag) )

def truncate(str, target_len, ellipsis = ''):
def truncate(str, target_len, full_word=False, ellipsis = ''):
"""Returns a copy of str truncated to target_len characters,
preserving HTML markup (which does not count towards the length).
Any tags that would be left open by truncation will be closed at
Expand All @@ -108,7 +107,7 @@ def truncate(str, target_len, ellipsis = ''):
length = 0 # number of characters (not counting markup) placed in retval so far
tokens = Tokenizer(str)
tok = tokens.next_token()
while tok != END and length < target_len:
while tok != END:
if tok.__class__.__name__ == 'OpenTag':
stack.append(tok)
retval.append( tok.as_string() )
Expand All @@ -124,13 +123,18 @@ def truncate(str, target_len, ellipsis = ''):
retval.append(tok)
length += 1
tok = tokens.next_token()
if length == target_len and not full_word:
break
elif length >= target_len and full_word and tok == " ":
break

while len(stack) > 0:
tok = CloseTag( stack.pop().tag )
retval.append( tok.as_string() )
if length == target_len:
if len(str) > length:
return ''.join(retval) + ellipsis
else:
return ''.join(retval)
return ''.join(retval)

if __name__ == "__main__":
try:
Expand Down
12 changes: 12 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

from setuptools import setup

# Package metadata for the single-module htmltruncate distribution.
setup(
    name='htmltruncate',
    version='1.0',
    # Short summary shown by package indexes; spelling fixed ("meaningfully").
    description='To truncate html content meaningfully',
    author='Eric Entzel',
    url='https://github.com/eentzel/htmltruncate.py',
    # The project ships as one top-level module, not a package directory.
    py_modules=['htmltruncate'],
)
5 changes: 4 additions & 1 deletion tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@ def testSelfClosing(self):
self.assertEqual( htmltruncate.truncate( "I need<br /> a break", 11 ), "I need<br /> a br" )

def testEllipsis(self):
self.assertEqual( htmltruncate.truncate('this <b>word</b> is bolded', 10, '...' ), "this <b>word</b> ...")
self.assertEqual( htmltruncate.truncate('this <b>word</b> is bolded', 10, ellipsis='...' ), "this <b>word</b> ...")

def testFullWord(self):
# The 11th text character falls inside "break" (the <br /> markup is not
# counted), so with full_word=True the expected result keeps the whole word.
self.assertEqual( htmltruncate.truncate( "I need<br /> a break", 11, full_word=True ), "I need<br /> a break" )

if __name__ == "__main__":
unittest.main()