add 'text.parse_float()' + cleanup in text.py

6 years ago · 2d2953a5bf
parent 0c32dc5858
commit 2d2953a5bf
4 changed files with 44 additions and 13 deletions
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@ -103,7 +103,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
            "gallery_id": text.parse_int(self.gallery_id),
            "title": text.unescape(title or ""),
            "user": ", ".join(users),
-            "fields": [f for f in text.split_html(fields) if f != ", "],
+            "fields": [f for f in text.split_html(fields) if f != ","],
            "date": text.parse_int(date),
            "views": text.parse_int(stats[0]),
            "votes": text.parse_int(stats[1]),
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2014-2018 Mike Fährmann
+# Copyright 2014-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@ -88,7 +88,7 @@ class SankakuExtractor(SharedConfigExtractor):
            "id": text.parse_int(post_id),
            "md5": file_url.rpartition("/")[2].partition(".")[0],
            "tags": text.unescape(tags),
-            "vote_average": float(vavg or 0),
+            "vote_average": text.parse_float(vavg),
            "vote_count": text.parse_int(vcnt),
            "created_at": created,
            "rating": (rating or "?")[0].lower(),
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@ -1,12 +1,12 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2018 Mike Fährmann
+# Copyright 2015-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-"""Collection of functions that work in strings/text"""
+"""Collection of functions that work on strings/text"""

 import re
 import html
@ -47,7 +47,7 @@ def split_html(txt, sep=None):
    """Split input string by html-tags"""
    try:
        return [
-            x for x in re.split("<[^>]+>", txt)
+            x.strip() for x in re.split("<[^>]+>", txt)
            if x and not x.isspace()
        ]
    except TypeError:
@ -165,6 +165,16 @@ def parse_int(value, default=0):
        return default


+def parse_float(value, default=0.0):
+    """Convert 'value' to float; return 'default' if falsy or unparsable"""
+    if not value:  # None, "", 0, empty containers: skip float() entirely
+        return default
+    try:
+        return float(value)
+    except (ValueError, TypeError):  # NOTE(review): OverflowError not caught
+        return default
+
+
 def parse_query(qs):
    """Parse a query string into key-value pairs"""
    result = {}
@ -182,12 +192,11 @@ if os.name == "nt":
 else:
    clean_path = clean_path_posix

+
 urljoin = urllib.parse.urljoin
+
+quote = urllib.parse.quote
 unquote = urllib.parse.unquote
-escape = html.escape

-try:
-    unescape = html.unescape
-except AttributeError:
-    import html.parser
-    unescape = html.parser.HTMLParser().unescape
+escape = html.escape
+unescape = html.unescape
--- a/test/test_text.py
+++ b/test/test_text.py
@ -71,8 +71,9 @@ class TestText(unittest.TestCase):
        # standard usage
        self.assertEqual(f(""), empty)
        self.assertEqual(f("Hello World."), ["Hello World."])
-        self.assertEqual(f(" Hello  World.  "), [" Hello  World.  "])
+        self.assertEqual(f(" Hello  World.  "), ["Hello  World."])
        self.assertEqual(f("Hello<br/>World."), result)
+        self.assertEqual(f(" Hello <br/> World.  "), result)
        self.assertEqual(
            f("<div><b class='a'>Hello</b><i>World.</i></div>"), result)

@ -260,6 +261,27 @@ class TestText(unittest.TestCase):
            self.assertEqual(f(value, default), default)
        self.assertEqual(f("zzz", default), default)

+    def test_parse_float(self, f=text.parse_float):
+        self.assertEqual(f(0), 0.0)  # falsy input yields the 0.0 default
+        self.assertEqual(f("0"), 0.0)
+        self.assertEqual(f(123), 123.0)
+        self.assertEqual(f("123"), 123.0)
+        self.assertEqual(f(123.456), 123.456)
+        self.assertEqual(f("123.456"), 123.456)
+
+        # invalid arguments fall back to the built-in default of 0.0
+        for value in INVALID_ALT:
+            self.assertEqual(f(value), 0.0)
+        self.assertEqual(f("zzz"), 0.0)
+        self.assertEqual(f([1, 2, 3]), 0.0)
+        self.assertEqual(f({1: 2, 3: 4}), 0.0)
+
+        # a caller-supplied 'default' is returned as-is for invalid input
+        default = "default"
+        for value in INVALID_ALT:
+            self.assertEqual(f(value, default), default)
+        self.assertEqual(f("zzz", default), default)
+
    def test_parse_query(self, f=text.parse_query):
        # standard usage
        self.assertEqual(f(""), {})