% Raff et al. (2016): journal article on byte n-gram features for malware
% classification. The url field uses the current https://doi.org/ resolver
% form of the DOI recorded in the doi field.
% NOTE(review): no volume/number is recorded and pages 1--20 may be an
% online-first placeholder -- confirm against the publisher record.
@article{Raff2016,
  author   = {Raff, Edward and
              Zak, Richard and
              Cox, Russell and
              Sylvester, Jared and
              Yacci, Paul and
              Ward, Rebecca and
              Tracy, Anna and
              McLean, Mark and
              Nicholas, Charles},
  title    = {An investigation of byte n-gram features for malware classification},
  journal  = {Journal of Computer Virology and Hacking Techniques},
  year     = {2016},
  pages    = {1--20},
  abstract = {Malware classification using machine learning algorithms is a difficult task, in part due to the absence of strong natural features in raw executable binary files. Byte n-grams previously have been used as features, but little work has been done to explain their performance or to understand what concepts are actually being learned. In contrast to other work using n-gram features, in this work we use orders of magnitude more data, and we perform feature selection during model building using Elastic-Net regularized Logistic Regression. We compute a regularization path and analyze novel multi-byte identifiers. Through this process, we discover significant previously unreported issues with byte n-gram features that cause their benefits and practicality to be overestimated. Three primary issues emerged from our work. First, we discovered a flaw in how previous corpora were created that leads to an over-estimation of classification accuracy. Second, we discovered that most of the information contained in n-grams stem from string features that could be obtained in simpler ways. Finally, we demonstrate that n-gram features promote overfitting, even with linear models and extreme regularization.},
  issn     = {2263-8733},
  doi      = {10.1007/s11416-016-0283-1},
  url      = {https://doi.org/10.1007/s11416-016-0283-1},
}

