Ich möchte einen Textfilter bauen, der anhand von Keywords Sätze/Texte vordefinierten Kategorien zuordnet. Als Dictionary dient eine JSON-Datei, in der alle Klassen mit den zugehörigen Keywords aufgelistet sind.
Der Python-Code:
Code: Alles auswählen
import json
import re
# convenience function: check if any topic term is contained within a sentence
def contains(a_list, a_sentence):
    """Return the first entry of ``a_list`` that occurs in ``a_sentence``.

    Two kinds of entries are supported:

    * Pattern entries start with an asterisk. Every asterisk is removed and
      every letter ``d`` is turned into the regex digit class, so
      ``"*dd:ddam"`` matches ``"10:30am"``. The remaining characters are
      interpreted as regular-expression syntax and matched case-sensitively.
    * All other entries are plain keywords, matched as case-insensitive
      substrings of the sentence.

    Empty entries are skipped.

    Args:
        a_list: Iterable of keyword / pattern strings.
        a_sentence: Text to search.

    Returns:
        The matching entry exactly as it appears in ``a_list``, or ``None``
        when nothing matches.
    """
    # casefold() instead of lower() so that "ß" and "SS" compare equal.
    folded_sentence = a_sentence.casefold()
    for item in a_list:
        if not item:
            # An empty keyword would otherwise fail on the first-character check.
            continue
        if item.startswith('*'):
            # Raw string: '\d' in a normal literal is an invalid escape sequence.
            pattern = item.replace('*', '').replace('d', r'\d')
            if re.search(pattern, a_sentence):
                return item
        elif item.casefold() in folded_sentence:
            return item
    return None
# convenience function: split text into one or more sentences
def split(text, rows_split_delim=('.', '!', '?')):
    """Split ``text`` into sentences at the given delimiter characters.

    Each sentence keeps its closing delimiter and is stripped of surrounding
    whitespace. Text containing ``http`` (in any letter case) is returned
    unsplit as a single-element list, so links are not broken apart at
    their dots.

    Args:
        text: The message text to split.
        rows_split_delim: Collection of single characters that end a
            sentence. The default is an immutable tuple rather than a list
            so it cannot be shared and mutated across calls.

    Returns:
        A list of sentences. The last element is whatever follows the final
        delimiter, which is an empty string when the text ends with a
        delimiter (or is empty).
    """
    # Don't try to split messages with links.
    if 'http' in text.lower():
        return [text]

    rows_split = []
    # Start of the sentence currently being collected.
    pointer = 0
    for position, char in enumerate(text):
        if char in rows_split_delim:
            rows_split.append(text[pointer:position + 1].strip())
            pointer = position + 1

    # Remaining text after the last delimiter; this also covers text that
    # contains no delimiter at all.
    rows_split.append(text[pointer:].strip())
    return rows_split
# Classify class definition
class Classifier(object):
"""A classification object, for topics defined in a json definiton
Attributes:
topics_file: a json structure containing words/patterns for a list of topics
"""
def __init__(self, topics_file):
# load topics and their words
try:
self.topics = json.load(open(topics_file))
except:
print ('error opening file', topics_file)
def classify(self, text):
topics_data = {}
# split out sentences from the text
sentences = split(text)
for sentence in sentences:
# loop through the topics
for key in self.topics.keys():
# if the sentence contains any of the words for this topic, add to results
if contains(self.topics[key], sentence):
if key not in topics_data:
topics_data[key] = [sentence]
else:
topics_data[key].append(sentence)
return topics_data
Code: Alles auswählen
"Sport": ["Skifahren", "Fußball", "Hängegleiten", "joggen"],
"Musik": ["Jazz", "Rock", "rap", "Classic"]
Beim Ausprobieren im Jupyter-Notebook:
Code: Alles auswählen
# Import only the name that is used instead of a wildcard import.
from msgClassify import Classifier

# Build the classifier from the topic definitions and classify one sentence.
c = Classifier('topics.json')
topics = c.classify("ich gehe gerne hängegleiten")

# Report depending on how many topics matched.
if not topics:
    print("Kein passendes Thema gefunden")
elif len(topics) == 1:
    print(topics)
else:
    print("mehrere Themen gefunden")
    print(topics)
kommt folgende Meldung:
Code: Alles auswählen
error opening file topics6.json
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-64-843108aa2b3d> in <module>()
3 c = Classifier('topics6.json')
4
----> 5 topics = c.classify("gerät")
6
7 if len(topics) == 1:
G:\10_others\sonstige_2019\msgClassify.py in classify(self, text)
89 for sentence in sentences:
90 # loop through the topics
---> 91 for key in self.topics.keys():
92 # if the sentence contains any of the words for this topic, add to results
93 if contains(self.topics[key], sentence):
AttributeError: 'Classifier' object has no attribute 'topics'
Und bei der Ausgabe der JSON-Datei in Jupyter mit diesem Code
Code: Alles auswählen
import json

# "utf-8-sig" skips a leading UTF-8 byte order mark, which json.load()
# rejects when the file is opened as plain "utf-8"; files without a BOM are
# read the same way.
with open('topics.json', encoding="utf-8-sig") as topics:
    data = json.load(topics)
print(data)
Code: Alles auswählen
---------------------------------------------------------------------------
JSONDecodeError Traceback (most recent call last)
<ipython-input-66-0375af9e0575> in <module>()
2
3 with open('topics.json', encoding="utf-8") as topics:
----> 4 data = json.load(topics)
5
6 print(data)
297 cls=cls, object_hook=object_hook,
298 parse_float=parse_float, parse_int=parse_int,
--> 299 parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
300
301
\\nzpro001\SOFTWARE32P\Python\A3_6\Anaconda3\lib\json\__init__.py in loads(s, encoding, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)
342 if s.startswith('\ufeff'):
343 raise JSONDecodeError("Unexpected UTF-8 BOM (decode using utf-8-sig)",
--> 344 s, 0)
345 else:
346 if not isinstance(s, (bytes, bytearray)):
JSONDecodeError: Unexpected UTF-8 BOM (decode using utf-8-sig): line 1 column 1 (char 0)
Ich habe übrigens alles in UTF-8 codiert, und das hat Nüsse gebracht. Was soll ich noch ausprobieren? Ich brauche einen Textfilter, der auch ä, ü, ß, ö filtern kann. Vielleicht gibt es noch eine bessere Methode?
Lieben Dank