<?xml version="1.0" encoding="UTF-8"?>
<article article-type="research-article" dtd-version="1.3" xml:lang="ru" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://metafora.rcsi.science/xsd_files/journal3.xsd">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">moitvivt</journal-id>
      <journal-title-group>
        <journal-title xml:lang="ru">Моделирование, оптимизация и информационные технологии</journal-title>
        <trans-title-group xml:lang="en">
          <trans-title>Modeling, Optimization and Information Technology</trans-title>
        </trans-title-group>
      </journal-title-group>
      <issn pub-type="epub">2310-6018</issn>
      <publisher>
        <publisher-name>Издательство</publisher-name>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="doi">10.26102/2310-6018/2021.32.1.004</article-id>
      <article-id pub-id-type="custom" custom-type="elpub">929</article-id>
      <title-group>
        <article-title xml:lang="ru">Метод распознавания эмоций человека по двигательной активности тела в видеопотоке на основе нейронных сетей</article-title>
        <trans-title-group xml:lang="en">
          <trans-title>Method of human emotion recognition through analysis of body motor activity in a video stream using neural networks</trans-title>
        </trans-title-group>
      </title-group>
      <contrib-group>
        <contrib contrib-type="author" corresp="yes">
          <contrib-id contrib-id-type="orcid">0000-0002-7032-0291</contrib-id>
          <name-alternatives>
            <name name-style="eastern" xml:lang="ru">
              <surname>Уздяев</surname>
              <given-names>Михаил Юрьевич</given-names>
            </name>
            <name name-style="western" xml:lang="en">
              <surname>Uzdiaev</surname>
              <given-names>Mikhail Yurievich</given-names>
            </name>
          </name-alternatives>
          <email>m.y.uzdiaev@gmail.com</email>
          <xref ref-type="aff">aff-1</xref>
        </contrib>
        <contrib contrib-type="author" corresp="yes">
          <contrib-id contrib-id-type="orcid">0000-0002-9509-178X</contrib-id>
          <name-alternatives>
            <name name-style="eastern" xml:lang="ru">
              <surname>Дударенко</surname>
              <given-names>Дмитрий Михайлович</given-names>
            </name>
            <name name-style="western" xml:lang="en">
              <surname>Dudarenko</surname>
              <given-names>Dmitry Mikhailovich</given-names>
            </name>
          </name-alternatives>
          <email>dmitry@dudarenko.net</email>
          <xref ref-type="aff">aff-2</xref>
        </contrib>
        <contrib contrib-type="author" corresp="yes">
          <name-alternatives>
            <name name-style="eastern" xml:lang="ru">
              <surname>Миронов</surname>
              <given-names>Виктор Николаевич</given-names>
            </name>
            <name name-style="western" xml:lang="en">
              <surname>Mironov</surname>
              <given-names>Viktor Nikolaevich</given-names>
            </name>
          </name-alternatives>
          <email>vmn20@mail.ru</email>
          <xref ref-type="aff">aff-3</xref>
        </contrib>
      </contrib-group>
      <aff-alternatives id="aff-1">
        <aff xml:lang="ru">Федеральное государственное бюджетное учреждение науки «Санкт-Петербургский Федеральный исследовательский центр Российской академии наук» (СПб ФИЦ РАН), Санкт-Петербургский институт информатики и автоматизации Российской академии наук</aff>
        <aff xml:lang="en">St. Petersburg Federal Research Center of the Russian Academy of Sciences (SPC RAS), St. Petersburg Institute for Informatics and Automation of the Russian Academy of Sciences</aff>
      </aff-alternatives>
      <aff-alternatives id="aff-2">
        <aff xml:lang="ru">Федеральное государственное бюджетное учреждение науки «Санкт-Петербургский Федеральный исследовательский центр Российской академии наук» (СПб ФИЦ РАН), Санкт-Петербургский институт информатики и автоматизации Российской академии наук</aff>
        <aff xml:lang="en">St. Petersburg Federal Research Center of the Russian Academy of Sciences (SPC RAS), St. Petersburg Institute for Informatics and Automation of the Russian Academy of Sciences</aff>
      </aff-alternatives>
      <aff-alternatives id="aff-3">
        <aff xml:lang="ru">Федеральное государственное бюджетное учреждение науки «Санкт-Петербургский Федеральный исследовательский центр Российской академии наук» (СПб ФИЦ РАН), Санкт-Петербургский институт информатики и автоматизации Российской академии наук</aff>
        <aff xml:lang="en">St. Petersburg Federal Research Center of the Russian Academy of Sciences (SPC RAS), St. Petersburg Institute for Informatics and Automation of the Russian Academy of Sciences</aff>
      </aff-alternatives>
      <pub-date pub-type="epub">
        <day>01</day>
        <month>01</month>
        <year>2021</year>
      </pub-date>
      <volume>9</volume>
      <issue>1</issue>
      <elocation-id>10.26102/2310-6018/2021.32.1.004</elocation-id>
      <permissions>
        <copyright-statement>Copyright © The Authors, 2021</copyright-statement>
        <copyright-year>2021</copyright-year>
        <license license-type="creative-commons-attribution" xlink:href="https://creativecommons.org/licenses/by/4.0/">
          <license-p>This work is licensed under a Creative Commons Attribution 4.0 International License</license-p>
        </license>
      </permissions>
      <self-uri xlink:href="https://moitvivt.ru/ru/journal/article?id=929"/>
      <abstract xml:lang="ru">
        <p>В данной статье рассматривается применение различных нейросетевых моделей для решения задачи распознавания эмоций человека по двигательной активности его тела на кадрах видеопотока без сложной предварительной обработки этих кадров. В работе представлены трехмерные сверточные нейронные сети: Inception 3D (I3D), Residual 3D (R3D), а также сверточно-рекуррентные нейросетевые архитектуры, использующие сверточную нейронную сеть архитектуры ResNet и рекуррентные нейросети архитектур LSTM и GRU (ResNet+LSTM, ResNet+GRU), которые не требуют предварительной обработки изображений или видеопотока и при этом потенциально позволяют достичь высокой точности распознавания эмоций. На основе рассмотренных архитектур предложен метод распознавания эмоций человека по двигательной активности тела в видеопотоке. Обсуждаются архитектурные особенности используемых моделей, способы обработки моделями кадров видеопотока, а также результаты распознавания эмоций по следующим метрикам качества: доля верно распознанных экземпляров (accuracy), точность (precision), полнота (recall). Результаты апробации предложенных в работе нейросетевых моделей I3D, R3D, ResNet+LSTM, ResNet+GRU на наборе данных FABO показали высокое качество распознавания эмоций по двигательной активности тела человека. Так, модель R3D показала лучшую долю верно распознанных экземпляров, равную 91 %. Другие предложенные модели: I3D, ResNet+LSTM, ResNet+GRU – показали точность распознавания 88 %, 80 % и 80 % соответственно. Таким образом, согласно полученным результатам экспериментальной оценки предложенных нейросетевых моделей, наиболее предпочтительными для использования при решении задачи распознавания эмоционального состояния человека по двигательной активности, с точки зрения совокупности показателей точности классификации эмоций, являются трехмерные сверточные модели I3D и R3D. При этом, предложенные модели, в отличие от большинства существующих решений, позволяют реализовывать распознавание эмоций на основе анализа RGB кадров видеопотока без выполнения их предварительной ресурсозатратной обработки, а также с высокой точностью выполнять распознавание эмоций в реальном масштабе времени.</p>
      </abstract>
      <trans-abstract xml:lang="en">
        <p>This paper considers the use of various neural network models for recognizing human emotions from body motor activity in video stream frames, without complex preprocessing of these frames. The paper presents three-dimensional convolutional neural networks, Inception 3D (I3D) and Residual 3D (R3D), as well as convolutional-recurrent architectures that combine a ResNet convolutional neural network with LSTM and GRU recurrent networks (ResNet+LSTM, ResNet+GRU); these models require no preliminary processing of images or the video stream and at the same time can potentially achieve high emotion recognition accuracy. Based on the considered architectures, a method for recognizing human emotions from body motor activity in a video stream is proposed. The architectural features of the models and the ways in which they process video stream frames are discussed, along with emotion recognition results under the following quality metrics: the proportion of correctly recognized instances (accuracy), precision, and recall. Evaluation of the proposed neural network models I3D, R3D, ResNet+LSTM and ResNet+GRU on the FABO dataset showed high-quality recognition of emotions from human body motor activity. The R3D model achieved the best accuracy, 91%, while the other proposed models, I3D, ResNet+LSTM and ResNet+GRU, reached 88%, 80% and 80%, respectively. Thus, according to the experimental evaluation, the three-dimensional convolutional models I3D and R3D are the most suitable for recognizing a person's emotional state from motor activity in terms of the combined emotion classification metrics. Moreover, unlike most existing solutions, the proposed models recognize emotions directly from RGB frames of a video stream without resource-intensive preprocessing and can perform recognition in real time with high accuracy.</p>
      </trans-abstract>
      <kwd-group xml:lang="ru">
        <kwd>нейросетевая модель</kwd>
        <kwd>распознавание эмоций</kwd>
        <kwd>сверточные нейронные сети</kwd>
        <kwd>машинное обучение</kwd>
        <kwd>обработка изображений</kwd>
        <kwd>видеопоток</kwd>
      </kwd-group>
      <kwd-group xml:lang="en">
        <kwd>neural network model</kwd>
        <kwd>emotion recognition</kwd>
        <kwd>convolutional neural networks</kwd>
        <kwd>machine learning</kwd>
        <kwd>image processing</kwd>
        <kwd>video stream</kwd>
      </kwd-group>
      <funding-group>
        <funding-statement xml:lang="ru">Исследование выполнено без спонсорской поддержки.</funding-statement>
        <funding-statement xml:lang="en">The study was performed without external funding.</funding-statement>
      </funding-group>
    </article-meta>
  </front>
  <back>
    <ref-list>
      <title>References</title>
      <ref id="cit1">
        <label>1</label>
        <mixed-citation xml:lang="ru">Ватаманюк И.В., Яковлев Р.Н. Алгоритмическая модель распределенной системы корпоративного информирования в рамках киберфизической системы организации. Моделирование, оптимизация и информационные технологии. 2019;7(4). Доступно по: https://moit.vivt.ru/wp-content/uploads/2019/11/VatamanukSoavtori_4_19_1.pdf. DOI: 10.26102/2310-6018/2019.27.4.026 (дата обращения: 20.10.2020).</mixed-citation>
      </ref>
      <ref id="cit2">
        <label>2</label>
        <mixed-citation xml:lang="ru">Letenkov M., Levonevskiy D. Fast Face Features Extraction Based on Deep Neural Networks for Mobile Robotic Platforms. International Conference on Interactive Collaborative Robotics. Springer, Cham. 2020:200-211. DOI: 10.1007/978-3-030-60337-3_20.</mixed-citation>
      </ref>
      <ref id="cit3">
        <label>3</label>
        <mixed-citation xml:lang="ru">Ватаманюк И.В., Яковлев Р.Н. Обобщенные теоретические модели киберфизических систем. Известия Юго-Западного государственного университета. 2019;23(6):161-175. Доступно по: https://science.swsu.ru/jour/article/view/666/489. DOI: 10.21869/2223-1560-2019-23-6-161-175 (дата обращения: 20.10.2020).</mixed-citation>
      </ref>
      <ref id="cit4">
        <label>4</label>
        <mixed-citation xml:lang="ru">Frijda N.H. Emotions and action. Feelings and emotions: The Amsterdam symposium. 2004:158-173.</mixed-citation>
      </ref>
      <ref id="cit5">
        <label>5</label>
        <mixed-citation xml:lang="ru">He G., Liu X., Fan F., You J. Image2Audio: Facilitating Semi-supervised Audio Emotion Recognition with Facial Expression Image. Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops. 2020:912-913.</mixed-citation>
      </ref>
      <ref id="cit6">
        <label>6</label>
        <mixed-citation xml:lang="ru">Kalsum T., Anwar S.M., Majid M., Khan B., Ali S.M. Emotion recognition from facial expressions using hybrid feature descriptors. IET Image Processing. 2018;12(6):1004-1012.</mixed-citation>
      </ref>
      <ref id="cit7">
        <label>7</label>
        <mixed-citation xml:lang="ru">Levonevskii D., Shumskaya O., Velichko A., Uzdiaev M., Malov D. Methods for Determination of Psychophysiological Condition of User Within Smart Environment Based on Complex Analysis of Heterogeneous Data. Proceedings of 14th International Conference on Electromechanics and Robotics «Zavalishin's Readings». Springer, Singapore. 2020:511-523.</mixed-citation>
      </ref>
      <ref id="cit8">
        <label>8</label>
        <mixed-citation xml:lang="ru">Уздяев М.Ю., Левоневский Д.К., Шумская О.О., Летенков М.А. Методы детектирования агрессивных пользователей информационного пространства на основе генеративно-состязательных нейронных сетей. Информационно-измерительные и управляющие системы. 2019;17(5):60-68.</mixed-citation>
      </ref>
      <ref id="cit9">
        <label>9</label>
        <mixed-citation xml:lang="ru">Uzdiaev M. Methods of Multimodal Data Fusion and Forming Latent Representation in the Human Aggression Recognition Task. 2020 IEEE 10th International Conference on Intelligent Systems (IS). IEEE. 2020:399-403.</mixed-citation>
      </ref>
      <ref id="cit10">
        <label>10</label>
        <mixed-citation xml:lang="ru">Thakur N., Han C.Y. A complex activity based emotion recognition algorithm for affect aware systems. 2018 IEEE 8th Annual Computing and Communication Workshop and Conference (CCWC). IEEE. 2018:748-753.</mixed-citation>
      </ref>
      <ref id="cit11">
        <label>11</label>
        <mixed-citation xml:lang="ru">Wu J., Zhang Y., Ning L. The Fusion Knowledge of Face, Body and Context for Emotion Recognition. 2019 IEEE International Conference on Multimedia &amp; Expo Workshops (ICMEW). IEEE. 2019:108-113.</mixed-citation>
      </ref>
      <ref id="cit12">
        <label>12</label>
        <mixed-citation xml:lang="ru">Piana S., Staglianò A., Odone F., Camurri A. Adaptive body gesture representation for automatic emotion recognition. ACM Transactions on Interactive Intelligent Systems (TiiS). 2016;6(1):1-31.</mixed-citation>
      </ref>
      <ref id="cit13">
        <label>13</label>
        <mixed-citation xml:lang="ru">Ly S.T., Lee G.S., Kim S.H., Yang H.J. Emotion Recognition via Body Gesture: Deep Learning Model Coupled with Keyframe Selection. Proceedings of the 2018 International Conference on Machine Learning and Machine Intelligence. 2018:27-31.</mixed-citation>
      </ref>
      <ref id="cit14">
        <label>14</label>
        <mixed-citation xml:lang="ru">Shen Z., Cheng J., Hu X., Dong Q. Emotion Recognition Based on Multi-View Body Gestures. 2019 IEEE International Conference on Image Processing (ICIP). IEEE, 2019:3317-3321.</mixed-citation>
      </ref>
      <ref id="cit15">
        <label>15</label>
        <mixed-citation xml:lang="ru">Targ S., Almeida D., Lyman K. Resnet in resnet: Generalizing residual architectures. arXiv preprint arXiv:1603.08029. 2016.</mixed-citation>
      </ref>
      <ref id="cit16">
        <label>16</label>
        <mixed-citation xml:lang="ru">Carreira J., Zisserman A. Quo vadis, action recognition? a new model and the kinetics dataset. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2017:6299-6308.</mixed-citation>
      </ref>
      <ref id="cit17">
        <label>17</label>
        <mixed-citation xml:lang="ru">Hara K., Kataoka H., Satoh Y. Learning spatio-temporal features with 3D residual networks for action recognition. Proceedings of the IEEE International Conference on Computer Vision Workshops. 2017:3154-3160.</mixed-citation>
      </ref>
      <ref id="cit18">
        <label>18</label>
        <mixed-citation xml:lang="ru">Deng J., Dong W., Socher R., Li L. J., Li K., Fei-Fei L. Imagenet: A large-scale hierarchical image database. 2009 IEEE conference on computer vision and pattern recognition. IEEE. 2009:248-255.</mixed-citation>
      </ref>
      <ref id="cit19">
        <label>19</label>
        <mixed-citation xml:lang="ru">Vinyals O., Toshev A., Bengio S., Erhan D. Show and tell: A neural image caption generator. Proceedings of the IEEE conference on computer vision and pattern recognition. 2015:3156-3164.</mixed-citation>
      </ref>
      <ref id="cit20">
        <label>20</label>
        <mixed-citation xml:lang="ru">Xu K., Ba J., Kiros R., Cho K., Courville A., Salakhudinov R., Bengio Y. Show, attend and tell: Neural image caption generation with visual attention. International conference on machine learning. 2015:2048-2057.</mixed-citation>
      </ref>
      <ref id="cit21">
        <label>21</label>
        <mixed-citation xml:lang="ru">Yao L., Torabi A., Cho K., Ballas N., Pal C., Larochelle H., Courville A. Describing videos by exploiting temporal structure. Proceedings of the IEEE international conference on computer vision. 2015:4507-4515.</mixed-citation>
      </ref>
      <ref id="cit22">
        <label>22</label>
        <mixed-citation xml:lang="ru">Hori C., Hori T., Lee T. Y., Zhang Z., Harsham B., Hershey J. R., Sumi K. Attention-based multimodal fusion for video description. Proceedings of the IEEE international conference on computer vision. 2017:4193-4202.</mixed-citation>
      </ref>
      <ref id="cit23">
        <label>23</label>
        <mixed-citation xml:lang="ru">Yue-Hei Ng, J., Hausknecht M., Vijayanarasimhan S., Vinyals O., Monga R., Toderici G. Beyond short snippets: Deep networks for video classification. Proceedings of the IEEE conference on computer vision and pattern recognition. 2015:4694-4702.</mixed-citation>
      </ref>
      <ref id="cit24">
        <label>24</label>
        <mixed-citation xml:lang="ru">Ullah A., Ahmad J., Muhammad K., Sajjad M., Baik S. W. Action recognition in video sequences using deep bi-directional LSTM with CNN features. IEEE Access. 2017;6:1155-1166.</mixed-citation>
      </ref>
      <ref id="cit25">
        <label>25</label>
        <mixed-citation xml:lang="ru">Girshick R. Fast r-cnn. Proceedings of the IEEE international conference on computer vision. 2015:1440-1448.</mixed-citation>
      </ref>
      <ref id="cit26">
        <label>26</label>
        <mixed-citation xml:lang="ru">Ren S., He K., Girshick R., Sun, J. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems. 2015:91-99.</mixed-citation>
      </ref>
      <ref id="cit27">
        <label>27</label>
        <mixed-citation xml:lang="ru">Redmon J., Divvala S., Girshick R., Farhadi A. You only look once: Unified, real-time object detection. Proceedings of the IEEE conference on computer vision and pattern recognition. 2016:779-788.</mixed-citation>
      </ref>
      <ref id="cit28">
        <label>28</label>
        <mixed-citation xml:lang="ru">Liu W., Anguelov D., Erhan D., Szegedy C., Reed S., Fu C.Y., Berg A.C. Ssd: Single shot multibox detector. European conference on computer vision. Springer, Cham, 2016:21-37.</mixed-citation>
      </ref>
      <ref id="cit29">
        <label>29</label>
        <mixed-citation xml:lang="ru">Pan S.J., Yang Q. A survey on transfer learning. IEEE Transactions on knowledge and data engineering. 2009;22(10):1345-1359.</mixed-citation>
      </ref>
      <ref id="cit30">
        <label>30</label>
        <mixed-citation xml:lang="ru">Weiss K., Khoshgoftaar T.M., Wang D.D. A survey of transfer learning. Journal of Big data. 2016;3(1):9.</mixed-citation>
      </ref>
      <ref id="cit31">
        <label>31</label>
        <mixed-citation xml:lang="ru">He K., Zhang X., Ren S., Sun J. Deep residual learning for image recognition. Proceedings of the IEEE conference on computer vision and pattern recognition. 2016:770-778.</mixed-citation>
      </ref>
      <ref id="cit32">
        <label>32</label>
        <mixed-citation xml:lang="ru">Hochreiter S., Schmidhuber J. Long short-term memory. Neural computation. 1997;9(8):1735-1780.</mixed-citation>
      </ref>
      <ref id="cit33">
        <label>33</label>
        <mixed-citation xml:lang="ru">Chung J., Gulcehre C., Cho K., Bengio Y. Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555. 2014.</mixed-citation>
      </ref>
      <ref id="cit34">
        <label>34</label>
        <mixed-citation xml:lang="ru">Tran D., Bourdev L., Fergus R., Torresani L., Paluri M. Learning spatiotemporal features with 3d convolutional networks. Proceedings of the IEEE international conference on computer vision. 2015:4489-4497.</mixed-citation>
      </ref>
      <ref id="cit35">
        <label>35</label>
        <mixed-citation xml:lang="ru">Hara K., Kataoka H., Satoh Y. Can spatiotemporal 3d cnns retrace the history of 2d cnns and imagenet? Proceedings of the IEEE conference on Computer Vision and Pattern Recognition. 2018:6546-6555.</mixed-citation>
      </ref>
      <ref id="cit36">
        <label>36</label>
        <mixed-citation xml:lang="ru">Saveliev A., Uzdiaev M., Dmitrii M. Aggressive Action Recognition Using 3D CNN Architectures. 2019 12th International Conference on Developments in eSystems Engineering (DeSE). IEEE. 2019:890-895.</mixed-citation>
      </ref>
      <ref id="cit37">
        <label>37</label>
        <mixed-citation xml:lang="ru">Kay W., Carreira J., Simonyan K., Zhang B., Hillier C., Vijayanarasimhan S., Suleyman M. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950. 2017.</mixed-citation>
      </ref>
      <ref id="cit38">
        <label>38</label>
        <mixed-citation xml:lang="ru">Szegedy C., Liu W., Jia Y., Sermanet P., Reed S., Anguelov D., Rabinovich A. Going deeper with convolutions. Proceedings of the IEEE conference on computer vision and pattern recognition. 2015:1-9.</mixed-citation>
      </ref>
      <ref id="cit39">
        <label>39</label>
        <mixed-citation xml:lang="ru">Gunes H., Piccardi M. A bimodal face and body gesture database for automatic analysis of human nonverbal affective behavior. 18th International Conference on Pattern Recognition (ICPR'06). IEEE. 2006;1:1148-1153.</mixed-citation>
      </ref>
      <ref id="cit40">
        <label>40</label>
        <mixed-citation xml:lang="ru">Kingma D. P., Ba J. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980. 2014.</mixed-citation>
      </ref>
      <ref id="cit41">
        <label>41</label>
        <mixed-citation xml:lang="ru">Gunes H., Piccardi M. Automatic temporal segment detection and affect recognition from face and body display. IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics). 2008;39(1):64-84.</mixed-citation>
      </ref>
      <ref id="cit42">
        <label>42</label>
        <mixed-citation xml:lang="ru">Chen S., Tian Y., Liu Q., Metaxas D.N. Recognizing expressions from face and body gesture by temporal normalized motion and appearance features. Image and Vision Computing. 2013;31(2):175-185.</mixed-citation>
      </ref>
      <ref id="cit43">
        <label>43</label>
        <mixed-citation xml:lang="ru">Barros P., Jirak D., Weber C., Wermter S. Multimodal emotional state recognition using sequence-dependent deep hierarchical features. Neural Networks. 2015;72:140-151.</mixed-citation>
      </ref>
      <ref id="cit44">
        <label>44</label>
        <mixed-citation xml:lang="ru">Bahdanau D., Cho K., Bengio Y. Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473. 2014.</mixed-citation>
      </ref>
    </ref-list>
    <fn-group>
      <fn fn-type="conflict">
        <p>The authors declare that they have no conflict of interest.</p>
      </fn>
    </fn-group>
  </back>
</article>