alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Java example source code file (DefaultStreamTokenizer.java)

This example Java source code file (DefaultStreamTokenizer.java) is included in the alvinalexander.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Learn more about this Java project at its project page.

Java - Java tags/keywords

atomicinteger, bufferedreader, defaultstreamtokenizer, inputstreamreader, ioexception, list, override, reader, runtimeexception, starting, streamtokenizer, string, stringbuffer, tokens, util

The DefaultStreamTokenizer.java Java example source code

/*
 *
 *  * Copyright 2015 Skymind,Inc.
 *  *
 *  *    Licensed under the Apache License, Version 2.0 (the "License");
 *  *    you may not use this file except in compliance with the License.
 *  *    You may obtain a copy of the License at
 *  *
 *  *        http://www.apache.org/licenses/LICENSE-2.0
 *  *
 *  *    Unless required by applicable law or agreed to in writing, software
 *  *    distributed under the License is distributed on an "AS IS" BASIS,
 *  *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  *    See the License for the specific language governing permissions and
 *  *    limitations under the License.
 *
 */

package org.deeplearning4j.text.tokenization.tokenizer;


import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Tokenizer based on the {@link java.io.StreamTokenizer}
 * @author Adam Gibson
 *
 */
public class DefaultStreamTokenizer implements Tokenizer {

    private StreamTokenizer streamTokenizer;
    private TokenPreProcess tokenPreProcess;
    private List<String> tokens = new ArrayList<>();
    private AtomicInteger position = new AtomicInteger(0);

    protected static final Logger log = LoggerFactory.getLogger(DefaultStreamTokenizer.class);

    public DefaultStreamTokenizer(InputStream is) {
        Reader r = new BufferedReader(new InputStreamReader(is));
        streamTokenizer = new StreamTokenizer(r);

    }

    /**
     * Checks, if underlying stream has any tokens left
     *
     * @return
     */
    private boolean streamHasMoreTokens() {
        if(streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
            try {
                streamTokenizer.nextToken();
            } catch (IOException e1) {
                throw new RuntimeException(e1);
            }
        }
        return streamTokenizer.ttype != StreamTokenizer.TT_EOF && streamTokenizer.ttype != -1;
    }

    /**
     * Checks, if any prebuffered tokens left, otherswise checks underlying stream
     * @return
     */
    @Override
    public boolean hasMoreTokens() {
        log.info("Tokens size: [" + tokens.size()+"], position: ["+ position.get()+"]");
        if (tokens.size() > 0) return position.get() < tokens.size();
            else return streamHasMoreTokens();
    }

    /**
     * Returns number of tokens
     * PLEASE NOTE: this method effectively preloads all tokens. So use it with caution, since on large streams it will consume big amount of memory
     *
     * @return
     */
    @Override
    public int countTokens() {
        return getTokens().size();
    }


    /**
     * This method returns next token from prebuffered list of tokens or underlying InputStream
     *
     * @return next token as String
     */
    @Override
    public String nextToken() {
        if (tokens.size() > 0 && position.get() < tokens.size()) return tokens.get(position.getAndIncrement());
        return nextTokenFromStream();
    }

    /**
     * This method returns next token from underlying InputStream
     *
     * @return
     */
    private String nextTokenFromStream() {
        StringBuffer sb = new StringBuffer();


        if(streamTokenizer.ttype == StreamTokenizer.TT_WORD) {
            sb.append(streamTokenizer.sval);
        } else if(streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
            sb.append(streamTokenizer.nval);
        } else if(streamTokenizer.ttype == StreamTokenizer.TT_EOL) {
            try {
                while(streamTokenizer.ttype == StreamTokenizer.TT_EOL)
                    streamTokenizer.nextToken();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        } else if(streamHasMoreTokens())
            return nextTokenFromStream();


        String ret =  sb.toString();

        if(tokenPreProcess != null)
            ret = tokenPreProcess.preProcess(ret);
        return ret;

    }

    /**
     * Returns all tokens as list of Strings
     *
     * @return List of tokens
     */
    @Override
    public List<String> getTokens() {
        //List<String> tokens = new ArrayList<>();
        if (tokens.size() > 0) return tokens;

        log.info("Starting prebuffering...");
        while(streamHasMoreTokens()) {
            tokens.add(nextTokenFromStream());
        }
        log.info("Tokens prefetch finished. Tokens size: [" + tokens.size()+"]");
        return tokens;
    }

    @Override
    public void setTokenPreProcessor(TokenPreProcess tokenPreProcessor) {
        this.tokenPreProcess = tokenPreProcessor;
    }

}

Other Java examples (source code examples)

Here is a short list of links related to this Java DefaultStreamTokenizer.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.