prepare.Rd
Preprocess the document. Note that this modifies the object in place.
prepare(text, ...) # S3 method for document prepare(text, remove_corrupt_utf8 = TRUE, remove_case = TRUE, strip_stopwords = TRUE, strip_numbers = TRUE, strip_html_tags = TRUE, strip_punctuation = TRUE, remove_words = NULL, strip_non_letters = FALSE, strip_sparse_terms = FALSE, strip_frequent_terms = FALSE, strip_articles = FALSE, strip_indefinite_articles = FALSE, strip_definite_articles = FALSE, strip_preposition = FALSE, strip_pronouns = FALSE, ...) # S3 method for documents prepare(text, remove_corrupt_utf8 = TRUE, remove_case = TRUE, strip_stopwords = TRUE, strip_numbers = TRUE, strip_html_tags = TRUE, strip_punctuation = TRUE, remove_words = NULL, strip_non_letters = FALSE, strip_sparse_terms = FALSE, strip_frequent_terms = FALSE, strip_articles = FALSE, strip_indefinite_articles = FALSE, strip_definite_articles = FALSE, strip_preposition = FALSE, strip_pronouns = FALSE, ...) # S3 method for corpus prepare(text, remove_corrupt_utf8 = TRUE, remove_case = TRUE, strip_stopwords = TRUE, strip_numbers = TRUE, strip_html_tags = TRUE, strip_punctuation = TRUE, remove_words = NULL, strip_non_letters = FALSE, strip_sparse_terms = FALSE, strip_frequent_terms = FALSE, strip_articles = FALSE, strip_indefinite_articles = FALSE, strip_definite_articles = FALSE, strip_preposition = FALSE, strip_pronouns = FALSE, ..., update_lexicon = TRUE, update_inverse_index = TRUE)
text | An object inheriting from class `document`, `documents`, or `corpus`. |
---|---|
... | Other special classes |
remove_corrupt_utf8 | Remove corrupt UTF8 characters. |
remove_case | Convert to lowercase. |
strip_stopwords | Remove stopwords, e.g.: "all", "almost", "alone". |
strip_numbers | Remove numbers. |
strip_html_tags | Remove html tags, including the style and script tags. |
strip_punctuation | Remove punctuation. |
remove_words | Remove the occurrences of the given words from `text`. |
strip_non_letters | Remove anything that is not a letter. |
strip_sparse_terms | Remove sparse terms. |
strip_frequent_terms | Remove frequent terms. |
strip_articles | Remove articles: "a", "an", "the". |
strip_indefinite_articles | Remove indefinite articles: "a", "an". |
strip_definite_articles | Remove "the". |
strip_preposition | Remove prepositions, e.g.: "across", "around", "before". |
strip_pronouns | Remove pronouns, e.g.: "I", "you", "he", "she". |
update_lexicon | Whether to update the lexicon of the corpus. |
update_inverse_index | Whether to update the inverse index of the corpus. |
stem_words
to stem your document.
# NOT RUN { init_textanalysis() # build document doc <- string_document("This <span>is</span> a very short document!.!") # replaces in place! prepare(doc) get_text(doc) # }