|
Java example source code file (InputHomogenization.java)
The InputHomogenization.java Java example source code/* * * * Copyright 2015 Skymind,Inc. * * * * Licensed under the Apache License, Version 2.0 (the "License"); * * you may not use this file except in compliance with the License. * * You may obtain a copy of the License at * * * * http://www.apache.org/licenses/LICENSE-2.0 * * * * Unless required by applicable law or agreed to in writing, software * * distributed under the License is distributed on an "AS IS" BASIS, * * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * * See the License for the specific language governing permissions and * * limitations under the License. * */ package org.deeplearning4j.text.inputsanitation; import java.text.Normalizer; import java.text.Normalizer.Form; import java.util.List; /** * Performs some very basic textual transformations * such as word shape, lower casing, and stripping of punctuation * @author Adam Gibson * */ public class InputHomogenization { private String input; private List<String> ignoreCharactersContaining; private boolean preserveCase; /** * Input text to applyTransformToOrigin * @param input the input text to applyTransformToOrigin, * equivalent to calling this(input,false) * wrt preserving case */ public InputHomogenization(String input) { this(input,false); } /** * * @param input the input to applyTransformToOrigin * @param preserveCase whether to preserve case */ public InputHomogenization(String input,boolean preserveCase) { this.input = input; this.preserveCase = preserveCase; } /** * * @param input the input to applyTransformToOrigin * @param ignoreCharactersContaining ignore transformation of words * containigng specified strings */ public InputHomogenization(String input,List<String> ignoreCharactersContaining) { this.input = input; this.ignoreCharactersContaining = ignoreCharactersContaining; } /** * Returns the normalized text passed in via constructor * @return the normalized text passed in via constructor */ public String transform() { StringBuffer sb = new StringBuffer(); for(int i = 0; i < input.length(); i++) { if(ignoreCharactersContaining != null && ignoreCharactersContaining.contains(String.valueOf(input.charAt(i)))) sb.append(input.charAt(i)); else if(Character.isDigit(input.charAt(i))) sb.append("d"); else if(Character.isUpperCase(input.charAt(i)) && !preserveCase) sb.append(Character.toLowerCase(input.charAt(i))); else sb.append(input.charAt(i)); } String normalized = Normalizer.normalize(sb.toString(),Form.NFD); normalized = normalized.replace(".",""); normalized = normalized.replace(",",""); normalized = normalized.replaceAll("\"",""); normalized = normalized.replace("'",""); normalized = normalized.replace("(",""); normalized = normalized.replace(")",""); normalized = normalized.replace("“",""); normalized = normalized.replace("”",""); normalized = normalized.replace("…",""); normalized = normalized.replace("|",""); normalized = normalized.replace("/",""); normalized = normalized.replace("\\", ""); normalized = normalized.replace("[", ""); normalized = normalized.replace("]", ""); normalized = normalized.replace("‘",""); normalized = normalized.replace("’",""); normalized = normalized.replaceAll("[!]+","!"); return normalized; } } Other Java examples (source code examples)Here is a short list of links related to this Java InputHomogenization.java source code file: |
... this post is sponsored by my books ... | |
#1 New Release! |
FP Best Seller |
Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.
A percentage of advertising revenue from
pages under the /java/jwarehouse
URI on this website is
paid back to open source projects.