add service
All checks were successful
Deploy Smart Search Backend Test / deploy (push) Successful in 1m44s

This commit is contained in:
vallyenfail
2026-01-20 14:26:27 +03:00
parent f2574a8838
commit 7e73144486
8 changed files with 325 additions and 25 deletions

113
pkg/fileparser/parser.go Normal file
View File

@@ -0,0 +1,113 @@
package fileparser
import (
"archive/zip"
"bytes"
"encoding/xml"
"io"
"net/http"
"strings"
"git.techease.ru/Smart-search/smart-search-back/pkg/errors"
)
func ExtractText(data []byte, _ string) (string, error) {
if len(data) == 0 {
return "", nil
}
mimeType := http.DetectContentType(data)
switch {
case strings.HasPrefix(mimeType, "text/"):
return string(data), nil
case mimeType == "application/zip" || mimeType == "application/octet-stream":
if isDocx(data) {
return extractDocx(data)
}
return "", errors.NewBusinessError(errors.UnsupportedFileFormat, "поддерживаются только текстовые файлы (.txt) и документы Word (.docx)")
default:
return "", errors.NewBusinessError(errors.UnsupportedFileFormat, "неподдерживаемый формат файла: "+mimeType+", поддерживаются .txt и .docx")
}
}
// isDocx reports whether data is a readable zip archive that contains an
// entry named exactly "word/document.xml". Data that cannot be opened as a
// zip archive yields false.
func isDocx(data []byte) bool {
	archive, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		return false
	}

	found := false
	for i := 0; i < len(archive.File) && !found; i++ {
		found = archive.File[i].Name == "word/document.xml"
	}
	return found
}
// extractDocx opens data as a zip archive, reads the "word/document.xml"
// entry and returns the text produced from it by extractTextFromXML.
//
// A FileProcessingError internal error is returned when the archive cannot be
// read, or when the entry cannot be opened or read. If the archive has no
// "word/document.xml" entry the result is an empty string and a nil error.
func extractDocx(data []byte) (string, error) {
	archive, err := zip.NewReader(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		return "", errors.NewInternalError(errors.FileProcessingError, "не удалось прочитать docx файл", err)
	}

	// Locate the first entry holding the main document part.
	var entry *zip.File
	for _, candidate := range archive.File {
		if candidate.Name == "word/document.xml" {
			entry = candidate
			break
		}
	}
	if entry == nil {
		return "", nil
	}

	src, err := entry.Open()
	if err != nil {
		return "", errors.NewInternalError(errors.FileProcessingError, "не удалось открыть содержимое документа", err)
	}
	// The entry is only read, so a Close error is deliberately ignored.
	defer func() { _ = src.Close() }()

	raw, err := io.ReadAll(src)
	if err != nil {
		return "", errors.NewInternalError(errors.FileProcessingError, "не удалось прочитать содержимое документа", err)
	}
	return extractTextFromXML(raw), nil
}
// extractTextFromXML returns the plain text of a WordprocessingML main
// document part (the contents of word/document.xml).
//
// The XML is walked token by token so that text is collected in document
// order from every <t> element found anywhere inside <body>: regular runs,
// runs nested in hyperlinks, paragraphs inside table cells and text boxes,
// and runs that carry more than one <t>. Elements are matched by local name
// only, regardless of namespace prefix.
//
// Each paragraph (<p>) with non-empty text becomes one line; lines are joined
// with "\n" and paragraphs without text are omitted. The function returns an
// empty string when the input is not well-formed XML or when the root element
// is not <document>.
func extractTextFromXML(data []byte) string {
	dec := xml.NewDecoder(bytes.NewReader(data))

	var (
		lines    []string
		current  strings.Builder
		depth    int  // nesting level of the element being processed; root is 1
		inBody   bool // true while inside the root's <body> child
		textOpen int  // number of currently open <t> elements
	)

	// flush ends the line being accumulated, dropping it if it has no text.
	flush := func() {
		if current.Len() > 0 {
			lines = append(lines, current.String())
			current.Reset()
		}
	}

	for {
		tok, err := dec.Token()
		if err != nil {
			// Covers syntax errors as well as io.EOF before the root element
			// was closed (empty or truncated input).
			return ""
		}

		switch el := tok.(type) {
		case xml.StartElement:
			depth++
			name := el.Name.Local
			switch {
			case depth == 1:
				if name != "document" {
					return ""
				}
			case depth == 2:
				inBody = name == "body"
			case !inBody:
				// Content outside <body> carries no document text.
			case name == "p":
				// A paragraph may start inside another one (text boxes), so
				// pending text of the outer paragraph is emitted first.
				flush()
			case name == "t":
				textOpen++
			case name == "Fallback":
				// The fallback branch of an AlternateContent block repeats
				// the content of the preferred branch; reading both would
				// duplicate the text.
				if err := dec.Skip(); err != nil {
					return ""
				}
				depth-- // Skip consumed the matching end element.
			}

		case xml.EndElement:
			if inBody && depth > 2 {
				switch el.Name.Local {
				case "p":
					flush()
				case "t":
					textOpen--
				}
			}
			if depth == 2 {
				inBody = false
			}
			depth--
			if depth == 0 {
				// Root closed: anything after it is ignored.
				flush()
				return strings.Join(lines, "\n")
			}

		case xml.CharData:
			if inBody && textOpen > 0 {
				current.Write(el)
			}
		}
	}
}

View File

@@ -0,0 +1,110 @@
package fileparser
import (
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestExtractText_EmptyData checks that both a nil slice and a zero-length
// slice produce an empty result without an error.
func TestExtractText_EmptyData(t *testing.T) {
	for _, input := range [][]byte{nil, {}} {
		text, err := ExtractText(input, "test.txt")
		assert.NoError(t, err)
		assert.Empty(t, text)
	}
}
// TestExtractText_PlainText checks that single-line UTF-8 text is returned
// unchanged.
func TestExtractText_PlainText(t *testing.T) {
	const want = "Тестовый текст для проверки"

	got, err := ExtractText([]byte(want), "document.txt")

	assert.NoError(t, err)
	assert.Equal(t, want, got)
}
// TestExtractText_PlainTextWithNewlines checks that multi-line text is
// returned unchanged, line breaks included.
func TestExtractText_PlainTextWithNewlines(t *testing.T) {
	input := []byte("Первая строка\nВторая строка\nТретья строка")

	got, err := ExtractText(input, "document.txt")

	assert.NoError(t, err)
	assert.Equal(t, string(input), got)
}
// TestExtractText_RealDocxFile runs ExtractText on the docx fixture from
// testdata and expects non-empty text; the extracted text is logged.
func TestExtractText_RealDocxFile(t *testing.T) {
	raw, readErr := os.ReadFile(filepath.Join("testdata", "test_document.docx"))
	require.NoError(t, readErr, "не удалось прочитать тестовый файл")

	text, err := ExtractText(raw, "тестовый.docx")

	assert.NoError(t, err)
	assert.NotEmpty(t, text, "текст из docx не должен быть пустым")
	t.Logf("Извлеченный текст из docx:\n%s", text)
}
// TestExtractText_DocxWithAnyFilename checks that the file name passed to
// ExtractText has no influence on the result: the same docx bytes must give
// the same non-empty text under two unrelated names.
func TestExtractText_DocxWithAnyFilename(t *testing.T) {
	raw, err := os.ReadFile(filepath.Join("testdata", "test_document.docx"))
	require.NoError(t, err)

	names := [2]string{"random_name_without_extension", "document.pdf"}
	var texts [2]string
	for i, name := range names {
		text, extractErr := ExtractText(raw, name)
		assert.NoError(t, extractErr)
		assert.NotEmpty(t, text)
		texts[i] = text
	}

	assert.Equal(t, texts[0], texts[1], "результат должен быть одинаковым независимо от имени файла")
}
// TestExtractText_UnsupportedFormat_PDF checks that data starting with a PDF
// header is rejected with an error mentioning UNSUPPORTED_FILE_FORMAT and
// that no text is returned.
func TestExtractText_UnsupportedFormat_PDF(t *testing.T) {
	pdfHeader := []byte("%PDF-1.4\n")
	result, err := ExtractText(pdfHeader, "document.pdf")
	assert.Empty(t, result)
	// assert.Error does not stop the test; the message is inspected only when
	// an error is actually present, so a regression is reported as a failed
	// assertion instead of a nil-dereference panic in err.Error().
	if assert.Error(t, err) {
		assert.Contains(t, err.Error(), "UNSUPPORTED_FILE_FORMAT")
	}
}
// TestExtractText_UnsupportedFormat_Image checks that data starting with the
// PNG signature is rejected with an error mentioning UNSUPPORTED_FILE_FORMAT
// and that no text is returned.
func TestExtractText_UnsupportedFormat_Image(t *testing.T) {
	pngHeader := []byte{0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A}
	result, err := ExtractText(pngHeader, "image.png")
	assert.Empty(t, result)
	// assert.Error does not stop the test; the message is inspected only when
	// an error is actually present, so a regression is reported as a failed
	// assertion instead of a nil-dereference panic in err.Error().
	if assert.Error(t, err) {
		assert.Contains(t, err.Error(), "UNSUPPORTED_FILE_FORMAT")
	}
}
// TestExtractText_InvalidDocx feeds ExtractText a payload that begins with
// the bytes "PK\x03\x04" followed by arbitrary text and expects an error and
// an empty result.
func TestExtractText_InvalidDocx(t *testing.T) {
	payload := []byte{0x50, 0x4B, 0x03, 0x04}
	payload = append(payload, "not a valid zip content"...)

	text, err := ExtractText(payload, "fake.docx")

	assert.Error(t, err)
	assert.Empty(t, text)
}
// TestIsDocx_ValidDocx checks that the docx fixture from testdata is
// recognised by isDocx.
func TestIsDocx_ValidDocx(t *testing.T) {
	raw, err := os.ReadFile(filepath.Join("testdata", "test_document.docx"))
	require.NoError(t, err)

	assert.True(t, isDocx(raw))
}
// TestIsDocx_RegularZip checks that the four bytes "PK\x03\x04" on their own
// are not recognised as a docx.
func TestIsDocx_RegularZip(t *testing.T) {
	assert.False(t, isDocx([]byte("PK\x03\x04")))
}
// TestIsDocx_NotZip checks that ordinary text is not recognised as a docx.
func TestIsDocx_NotZip(t *testing.T) {
	assert.False(t, isDocx([]byte("plain text content")))
}

Binary file not shown.