Package org.ujmp.core.text
Class TextUtil
java.lang.Object
org.ujmp.core.text.TextUtil
-
Field Summary
Fields
Modifier and Type | Field | Description
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
static final String
-
Constructor Summary
Constructors -
Method Summary
Modifier and Type | Method | Description
static Collection<TextToken>
convertSentenceToTextTokens
(String sentence) static Collection<TextSentence>
static final Matrix
createBagOfWordsVector
(String string, List<String> dictionary) private static final boolean[]
createCharacterVector
(char... chars) createWordBigrams
(String text) createWordBigrams
(List<String> words) createWordTrigrams
(String text) createWordTrigrams
(List<String> words) createWordUnigrams
(String text, int ngramSize) static boolean
endsWithAbbreviation
(String string) static final Matrix
getCharacterBigramFrequencies
(String s, char... validCharacters) static final DenseDoubleMatrix2D
getCharacterFrequencies
(String s, char... validCharacters) static final Matrix
getCharacterTrigramFrequencies
(String s, char... validCharacters) getWordBigramCounts
(String text) getWordUnigramCounts
(String text, int ngramSize) splitLineIntoSentences
(String line) splitSentenceIntoTokens
(String sentence, int ngramSize) splitTextIntoLines
(String text) static final DefaultTextBlock
splitTextIntoObjects
(String text) splitTextIntoSentences
(String text) static Matrix
stringToVector
(String string) static Matrix
stringToVector
(String string, int size)
-
Field Details
-
ALPHA_NUMERIC_REGEX
- See Also:
-
HAS_DASH_REGEX
- See Also:
-
INIT_DASH_REGEX
- See Also:
-
END_DASH_REGEX
- See Also:
-
PUNCTUATION_REGEX
- See Also:
-
ONE_QUESTION_MARK_REGEX
- See Also:
-
TWO_QUESTION_MARKS_REGEX
- See Also:
-
THREE_QUESTION_MARKS_REGEX
- See Also:
-
MULTIPLE_QUESTION_MARKS_REGEX
- See Also:
-
ONE_EXCLAMATION_MARK_REGEX
- See Also:
-
TWO_EXCLAMATION_MARKS_REGEX
- See Also:
-
THREE_EXCLAMATION_MARKS_REGEX
- See Also:
-
MULTIPLE_EXCLAMATION_MARKS_REGEX
- See Also:
-
QUESTION_EXCLAMATION_MARK_REGEX
- See Also:
-
EXCLAMATION_QUESTION_MARK_REGEX
- See Also:
-
INIT_CAPS_REGEX
- See Also:
-
INIT_CAPS_ALPHA_REGEX
- See Also:
-
ONE_CAP_REGEX
- See Also:
-
TWO_CAPS_REGEX
- See Also:
-
THREE_CAPS_REGEX
- See Also:
-
FOUR_CAPS_REGEX
- See Also:
-
ALL_CAPS_REGEX
- See Also:
-
CAPS_MIX_REGEX
- See Also:
-
ONE_DIGIT_REGEX
- See Also:
-
TWO_DIGITS_REGEX
- See Also:
-
THREE_DIGITS_REGEX
- See Also:
-
FOUR_DIGITS_REGEX
- See Also:
-
HAS_DIGIT_REGEX
- See Also:
-
POSITIVE_INTEGER_REGEX
- See Also:
-
NEGATIVE_INTEGER_REGEX
- See Also:
-
FLOATING_POINT_NUMBER_REGEX
- See Also:
-
EXP_NUMBER_REGEX
- See Also:
-
ROMAN_NUMBER_SMALL_REGEX
- See Also:
-
ROMAN_NUMBER_CAPITAL_REGEX
- See Also:
-
SINGLE_INITIAL_REGEX
- See Also:
-
IN_PARENTHESES_REGEX
- See Also:
-
OBD_REGEX
- See Also:
-
YEAR_REGEX
- See Also:
-
HEX_REGEX
- See Also:
-
EMAIL_REGEX
- See Also:
-
IP_REGEX
- See Also:
-
HTML_REGEX
- See Also:
-
URL_REGEX
- See Also:
-
-
Constructor Details
-
TextUtil
public TextUtil()
-
-
Method Details
-
getCharacterFrequencies
-
splitLineIntoSentences
-
splitTextIntoObjects
-
createWordTrigrams
-
splitTextIntoLines
-
createWordBigrams
-
createWordUnigrams
-
getWordBigramCounts
-
getWordUnigramCounts
-
splitSentenceIntoTokens
-
createWordBigrams
-
createWordTrigrams
-
endsWithAbbreviation
-
createCharacterVector
private static final boolean[] createCharacterVector(char... chars)
-
getCharacterBigramFrequencies
-
getCharacterTrigramFrequencies
-
createBagOfWordsVector
-
convertSentenceToTextTokens
-
convertToTextBlockToSentences
-
splitTextIntoSentences
-
stringToVector
-
stringToVector
-