Package org.ujmp.core.text
Class TextUtil
- java.lang.Object
-
- org.ujmp.core.text.TextUtil
-
public abstract class TextUtil extends java.lang.Object
-
-
Field Summary
Fields
Modifier and Type | Field | Description
static java.lang.String
ALL_CAPS_REGEX
static java.lang.String
ALPHA_NUMERIC_REGEX
static java.lang.String
CAPS_MIX_REGEX
static java.lang.String
EMAIL_REGEX
static java.lang.String
END_DASH_REGEX
static java.lang.String
EXCLAMATION_QUESTION_MARK_REGEX
static java.lang.String
EXP_NUMBER_REGEX
static java.lang.String
FLOATING_POINT_NUMBER_REGEX
static java.lang.String
FOUR_CAPS_REGEX
static java.lang.String
FOUR_DIGITS_REGEX
static java.lang.String
HAS_DASH_REGEX
static java.lang.String
HAS_DIGIT_REGEX
static java.lang.String
HEX_REGEX
static java.lang.String
HTML_REGEX
static java.lang.String
IN_PARENTHESES_REGEX
static java.lang.String
INIT_CAPS_ALPHA_REGEX
static java.lang.String
INIT_CAPS_REGEX
static java.lang.String
INIT_DASH_REGEX
static java.lang.String
IP_REGEX
static java.lang.String
MULTIPLE_EXCLAMATION_MARKS_REGEX
static java.lang.String
MULTIPLE_QUESTION_MARKS_REGEX
static java.lang.String
NEGATIVE_INTEGER_REGEX
static java.lang.String
OBD_REGEX
static java.lang.String
ONE_CAP_REGEX
static java.lang.String
ONE_DIGIT_REGEX
static java.lang.String
ONE_EXCLAMATION_MARK_REGEX
static java.lang.String
ONE_QUESTION_MARK_REGEX
static java.lang.String
POSITIVE_INTEGER_REGEX
static java.lang.String
PUNCTUATION_REGEX
static java.lang.String
QUESTION_EXCLAMATION_MARK_REGEX
static java.lang.String
ROMAN_NUMBER_CAPITAL_REGEX
static java.lang.String
ROMAN_NUMBER_SMALL_REGEX
static java.lang.String
SINGLE_INITIAL_REGEX
static java.lang.String
THREE_CAPS_REGEX
static java.lang.String
THREE_DIGITS_REGEX
static java.lang.String
THREE_EXCLAMATION_MARKS_REGEX
static java.lang.String
THREE_QUESTION_MARKS_REGEX
static java.lang.String
TWO_CAPS_REGEX
static java.lang.String
TWO_DIGITS_REGEX
static java.lang.String
TWO_EXCLAMATION_MARKS_REGEX
static java.lang.String
TWO_QUESTION_MARKS_REGEX
static java.lang.String
URL_REGEX
static java.lang.String
YEAR_REGEX
-
Constructor Summary
Constructors
Constructor | Description
TextUtil()
-
Method Summary
All Methods | Static Methods | Concrete Methods
Modifier and Type | Method | Description
static java.util.Collection<TextToken>
convertSentenceToTextTokens(java.lang.String sentence)
static java.util.Collection<TextSentence>
convertToTextBlockToSentences(java.lang.String text)
static Matrix
createBagOfWordsVector(java.lang.String string, java.util.List<java.lang.String> dictionary)
private static boolean[]
createCharacterVector(char... chars)
static java.util.List<java.util.List<java.lang.String>>
createWordBigrams(java.lang.String text)
static java.util.List<java.util.List<java.lang.String>>
createWordBigrams(java.util.List<java.lang.String> words)
static java.util.List<java.util.List<java.lang.String>>
createWordTrigrams(java.lang.String text)
static java.util.List<java.util.List<java.lang.String>>
createWordTrigrams(java.util.List<java.lang.String> words)
static java.util.List<java.lang.String>
createWordUnigrams(java.lang.String text, int ngramSize)
static boolean
endsWithAbbreviation(java.lang.String string)
static Matrix
getCharacterBigramFrequencies(java.lang.String s, char... validCharacters)
static DenseDoubleMatrix2D
getCharacterFrequencies(java.lang.String s, char... validCharacters)
static Matrix
getCharacterTrigramFrequencies(java.lang.String s, char... validCharacters)
static java.util.Map<java.util.List<java.lang.String>,java.lang.Integer>
getWordBigramCounts(java.lang.String text)
static java.util.Map<java.lang.String,java.lang.Integer>
getWordUnigramCounts(java.lang.String text, int ngramSize)
static java.util.List<java.lang.String>
splitLineIntoSentences(java.lang.String line)
static java.util.List<java.lang.String>
splitSentenceIntoTokens(java.lang.String sentence, int ngramSize)
static java.util.List<java.lang.String>
splitTextIntoLines(java.lang.String text)
static DefaultTextBlock
splitTextIntoObjects(java.lang.String text)
static java.util.List<java.lang.String>
splitTextIntoSentences(java.lang.String text)
static Matrix
stringToVector(java.lang.String string)
static Matrix
stringToVector(java.lang.String string, int size)
-
-
-
Field Detail
-
ALPHA_NUMERIC_REGEX
public static final java.lang.String ALPHA_NUMERIC_REGEX
- See Also:
- Constant Field Values
-
HAS_DASH_REGEX
public static final java.lang.String HAS_DASH_REGEX
- See Also:
- Constant Field Values
-
INIT_DASH_REGEX
public static final java.lang.String INIT_DASH_REGEX
- See Also:
- Constant Field Values
-
END_DASH_REGEX
public static final java.lang.String END_DASH_REGEX
- See Also:
- Constant Field Values
-
PUNCTUATION_REGEX
public static final java.lang.String PUNCTUATION_REGEX
- See Also:
- Constant Field Values
-
ONE_QUESTION_MARK_REGEX
public static final java.lang.String ONE_QUESTION_MARK_REGEX
- See Also:
- Constant Field Values
-
TWO_QUESTION_MARKS_REGEX
public static final java.lang.String TWO_QUESTION_MARKS_REGEX
- See Also:
- Constant Field Values
-
THREE_QUESTION_MARKS_REGEX
public static final java.lang.String THREE_QUESTION_MARKS_REGEX
- See Also:
- Constant Field Values
-
MULTIPLE_QUESTION_MARKS_REGEX
public static final java.lang.String MULTIPLE_QUESTION_MARKS_REGEX
- See Also:
- Constant Field Values
-
ONE_EXCLAMATION_MARK_REGEX
public static final java.lang.String ONE_EXCLAMATION_MARK_REGEX
- See Also:
- Constant Field Values
-
TWO_EXCLAMATION_MARKS_REGEX
public static final java.lang.String TWO_EXCLAMATION_MARKS_REGEX
- See Also:
- Constant Field Values
-
THREE_EXCLAMATION_MARKS_REGEX
public static final java.lang.String THREE_EXCLAMATION_MARKS_REGEX
- See Also:
- Constant Field Values
-
MULTIPLE_EXCLAMATION_MARKS_REGEX
public static final java.lang.String MULTIPLE_EXCLAMATION_MARKS_REGEX
- See Also:
- Constant Field Values
-
QUESTION_EXCLAMATION_MARK_REGEX
public static final java.lang.String QUESTION_EXCLAMATION_MARK_REGEX
- See Also:
- Constant Field Values
-
EXCLAMATION_QUESTION_MARK_REGEX
public static final java.lang.String EXCLAMATION_QUESTION_MARK_REGEX
- See Also:
- Constant Field Values
-
INIT_CAPS_REGEX
public static final java.lang.String INIT_CAPS_REGEX
- See Also:
- Constant Field Values
-
INIT_CAPS_ALPHA_REGEX
public static final java.lang.String INIT_CAPS_ALPHA_REGEX
- See Also:
- Constant Field Values
-
ONE_CAP_REGEX
public static final java.lang.String ONE_CAP_REGEX
- See Also:
- Constant Field Values
-
TWO_CAPS_REGEX
public static final java.lang.String TWO_CAPS_REGEX
- See Also:
- Constant Field Values
-
THREE_CAPS_REGEX
public static final java.lang.String THREE_CAPS_REGEX
- See Also:
- Constant Field Values
-
FOUR_CAPS_REGEX
public static final java.lang.String FOUR_CAPS_REGEX
- See Also:
- Constant Field Values
-
ALL_CAPS_REGEX
public static final java.lang.String ALL_CAPS_REGEX
- See Also:
- Constant Field Values
-
CAPS_MIX_REGEX
public static final java.lang.String CAPS_MIX_REGEX
- See Also:
- Constant Field Values
-
ONE_DIGIT_REGEX
public static final java.lang.String ONE_DIGIT_REGEX
- See Also:
- Constant Field Values
-
TWO_DIGITS_REGEX
public static final java.lang.String TWO_DIGITS_REGEX
- See Also:
- Constant Field Values
-
THREE_DIGITS_REGEX
public static final java.lang.String THREE_DIGITS_REGEX
- See Also:
- Constant Field Values
-
FOUR_DIGITS_REGEX
public static final java.lang.String FOUR_DIGITS_REGEX
- See Also:
- Constant Field Values
-
HAS_DIGIT_REGEX
public static final java.lang.String HAS_DIGIT_REGEX
- See Also:
- Constant Field Values
-
POSITIVE_INTEGER_REGEX
public static final java.lang.String POSITIVE_INTEGER_REGEX
- See Also:
- Constant Field Values
-
NEGATIVE_INTEGER_REGEX
public static final java.lang.String NEGATIVE_INTEGER_REGEX
- See Also:
- Constant Field Values
-
FLOATING_POINT_NUMBER_REGEX
public static final java.lang.String FLOATING_POINT_NUMBER_REGEX
- See Also:
- Constant Field Values
-
EXP_NUMBER_REGEX
public static final java.lang.String EXP_NUMBER_REGEX
- See Also:
- Constant Field Values
-
ROMAN_NUMBER_SMALL_REGEX
public static final java.lang.String ROMAN_NUMBER_SMALL_REGEX
- See Also:
- Constant Field Values
-
ROMAN_NUMBER_CAPITAL_REGEX
public static final java.lang.String ROMAN_NUMBER_CAPITAL_REGEX
- See Also:
- Constant Field Values
-
SINGLE_INITIAL_REGEX
public static final java.lang.String SINGLE_INITIAL_REGEX
- See Also:
- Constant Field Values
-
IN_PARENTHESES_REGEX
public static final java.lang.String IN_PARENTHESES_REGEX
- See Also:
- Constant Field Values
-
OBD_REGEX
public static final java.lang.String OBD_REGEX
- See Also:
- Constant Field Values
-
YEAR_REGEX
public static final java.lang.String YEAR_REGEX
- See Also:
- Constant Field Values
-
HEX_REGEX
public static final java.lang.String HEX_REGEX
- See Also:
- Constant Field Values
-
EMAIL_REGEX
public static final java.lang.String EMAIL_REGEX
- See Also:
- Constant Field Values
-
IP_REGEX
public static final java.lang.String IP_REGEX
- See Also:
- Constant Field Values
-
HTML_REGEX
public static final java.lang.String HTML_REGEX
- See Also:
- Constant Field Values
-
URL_REGEX
public static final java.lang.String URL_REGEX
- See Also:
- Constant Field Values
-
-
Method Detail
-
getCharacterFrequencies
public static final DenseDoubleMatrix2D getCharacterFrequencies(java.lang.String s, char... validCharacters)
-
splitLineIntoSentences
public static final java.util.List<java.lang.String> splitLineIntoSentences(java.lang.String line)
-
splitTextIntoObjects
public static final DefaultTextBlock splitTextIntoObjects(java.lang.String text)
-
createWordTrigrams
public static final java.util.List<java.util.List<java.lang.String>> createWordTrigrams(java.lang.String text)
-
splitTextIntoLines
public static final java.util.List<java.lang.String> splitTextIntoLines(java.lang.String text)
-
createWordBigrams
public static final java.util.List<java.util.List<java.lang.String>> createWordBigrams(java.lang.String text)
-
createWordUnigrams
public static final java.util.List<java.lang.String> createWordUnigrams(java.lang.String text, int ngramSize)
-
getWordBigramCounts
public static java.util.Map<java.util.List<java.lang.String>,java.lang.Integer> getWordBigramCounts(java.lang.String text)
-
getWordUnigramCounts
public static java.util.Map<java.lang.String,java.lang.Integer> getWordUnigramCounts(java.lang.String text, int ngramSize)
-
splitSentenceIntoTokens
public static final java.util.List<java.lang.String> splitSentenceIntoTokens(java.lang.String sentence, int ngramSize)
-
createWordBigrams
public static final java.util.List<java.util.List<java.lang.String>> createWordBigrams(java.util.List<java.lang.String> words)
-
createWordTrigrams
public static final java.util.List<java.util.List<java.lang.String>> createWordTrigrams(java.util.List<java.lang.String> words)
-
endsWithAbbreviation
public static boolean endsWithAbbreviation(java.lang.String string)
-
createCharacterVector
private static final boolean[] createCharacterVector(char... chars)
-
getCharacterBigramFrequencies
public static final Matrix getCharacterBigramFrequencies(java.lang.String s, char... validCharacters)
-
getCharacterTrigramFrequencies
public static final Matrix getCharacterTrigramFrequencies(java.lang.String s, char... validCharacters)
-
createBagOfWordsVector
public static final Matrix createBagOfWordsVector(java.lang.String string, java.util.List<java.lang.String> dictionary)
-
convertSentenceToTextTokens
public static java.util.Collection<TextToken> convertSentenceToTextTokens(java.lang.String sentence)
-
convertToTextBlockToSentences
public static java.util.Collection<TextSentence> convertToTextBlockToSentences(java.lang.String text)
-
splitTextIntoSentences
public static java.util.List<java.lang.String> splitTextIntoSentences(java.lang.String text)
-
stringToVector
public static Matrix stringToVector(java.lang.String string)
-
stringToVector
public static Matrix stringToVector(java.lang.String string, int size)
-
-